diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index 9783c2b7a..ab09ea418 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -225,6 +225,8 @@ source code and documentation. - [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c) - [dev/x86_64/src/poly_use_hint_32_avx2.c](dev/x86_64/src/poly_use_hint_32_avx2.c) - [dev/x86_64/src/poly_use_hint_88_avx2.c](dev/x86_64/src/poly_use_hint_88_avx2.c) + - [dev/x86_64/src/polyw1_pack_32_avx2.c](dev/x86_64/src/polyw1_pack_32_avx2.c) + - [dev/x86_64/src/polyw1_pack_88_avx2.c](dev/x86_64/src/polyw1_pack_88_avx2.c) - [dev/x86_64/src/polyz_unpack_17_avx2.c](dev/x86_64/src/polyz_unpack_17_avx2.c) - [dev/x86_64/src/polyz_unpack_19_avx2.c](dev/x86_64/src/polyz_unpack_19_avx2.c) - [dev/x86_64/src/rej_uniform_avx2.c](dev/x86_64/src/rej_uniform_avx2.c) @@ -243,6 +245,8 @@ source code and documentation. - [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c) - [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c) - [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c) + - [mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c](mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c) + - [mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c](mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c) - [mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c) - [mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c) - [mldsa/src/native/x86_64/src/rej_uniform_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_avx2.c) diff --git a/dev/aarch64_clean/meta.h b/dev/aarch64_clean/meta.h index 2923b8c55..c1c1bd4db 100644 --- a/dev/aarch64_clean/meta.h +++ b/dev/aarch64_clean/meta.h @@ -21,6 +21,8 @@ #define MLD_USE_NATIVE_POLY_CHKNORM #define MLD_USE_NATIVE_POLYZ_UNPACK_17 #define MLD_USE_NATIVE_POLYZ_UNPACK_19 +#define MLD_USE_NATIVE_POLYW1_PACK_32 +#define MLD_USE_NATIVE_POLYW1_PACK_88 #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 @@ -198,6 +200,44 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf) #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ || MLD_CONFIG_PARAMETER_SET == 87 */ +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87) +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a) +{ + mld_polyw1_pack_32_asm(r, a); + return MLD_NATIVE_FUNC_SUCCESS; +} +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87 */ + +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44 +/* Table of constants for polyw1_pack_88_asm: + * [0:15] v_shifts: USHL shift amounts {0, 6, 12, 18} as .4s + * [16:31] v_tbl0: TBL indices for out0 from {v16, v17} + * [32:47] v_tbl1: TBL indices for out1 from {v17, v18} + * [48:63] v_tbl2: TBL indices for out2 from {v18, v19} */ +/* clang-format off */ +MLD_ALIGN static const uint8_t mld_polyw1_pack_88_consts[] = { + /* v_shifts: {0, 6, 12, 18} as uint32_t little-endian */ + 0, 0, 0, 0, 6, 0, 0, 0, 12, 0, 0, 0, 18, 0, 0, 0, + /* v_tbl0: {0,1,2, 4,5,6, 8,9,10, 12,13,14, 16,17,18, 20} */ + 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, + /* v_tbl1: {5,6, 8,9,10, 12,13,14, 16,17,18, 20,21,22, 24,25} */ + 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25, + /* v_tbl2: {10, 12,13,14, 16,17,18, 20,21,22, 24,25,26, 28,29,30} */ + 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25, 26, 28, 29, 30, +}; +/* clang-format on */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a) +{ + mld_polyw1_pack_88_asm(r, a, mld_polyw1_pack_88_consts); + return MLD_NATIVE_FUNC_SUCCESS; +} +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \ + */ + MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_poly_pointwise_montgomery_native( int32_t out[MLDSA_N], const int32_t in0[MLDSA_N], diff --git a/dev/aarch64_clean/src/arith_native_aarch64.h b/dev/aarch64_clean/src/arith_native_aarch64.h index f78a1487d..967884699 100644 --- a/dev/aarch64_clean/src/arith_native_aarch64.h +++ b/dev/aarch64_clean/src/arith_native_aarch64.h @@ -105,6 +105,12 @@ void mld_polyz_unpack_17_asm(int32_t *r, const uint8_t *buf, void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf, const uint8_t *indices); +#define mld_polyw1_pack_32_asm MLD_NAMESPACE(polyw1_pack_32_asm) +void mld_polyw1_pack_32_asm(uint8_t *r, const int32_t *a); + +#define mld_polyw1_pack_88_asm MLD_NAMESPACE(polyw1_pack_88_asm) +void mld_polyw1_pack_88_asm(uint8_t *r, const int32_t *a, const uint8_t *table); + #define mld_poly_pointwise_montgomery_asm \ MLD_NAMESPACE(poly_pointwise_montgomery_asm) void mld_poly_pointwise_montgomery_asm(int32_t *, const int32_t *, diff --git a/dev/aarch64_clean/src/polyw1_pack_32_asm.S b/dev/aarch64_clean/src/polyw1_pack_32_asm.S new file mode 100644 index 000000000..0598b71de --- /dev/null +++ b/dev/aarch64_clean/src/polyw1_pack_32_asm.S @@ -0,0 +1,112 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87) +/* simpasm: header-end */ + +/* + * polyw1_pack_32: Pack w1 polynomial for GAMMA2 = (Q-1)/32. + * + * Each coefficient is in [0, 15] (4 bits) stored in a 32-bit word. + * Pack 2 coefficients per byte: r[i] = a[2i] | (a[2i+1] << 4) + * 256 coefficients -> 128 output bytes. + * + * UZP1 narrowing chain (32->16->8 bit) extracts the low byte from + * each coefficient; UZP1/UZP2 separate even/odd coefficients; + * SLI shifts and inserts the odd nibbles. + * + * 4x unrolled, 2 iterations for 256 coefficients. + */ + + output .req x0 + input .req x1 + count .req x2 + +.text +.global MLD_ASM_NAMESPACE(polyw1_pack_32_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(polyw1_pack_32_asm) + + mov count, #(256 / (32 * 4)) + +polyw1_pack_32_loop: + + /* Block 0: coefficients 0-31 */ + ldp q0, q1, [input], #512 + ldp q2, q3, [input, #(32 - 512)] + ldp q4, q5, [input, #(64 - 512)] + ldp q6, q7, [input, #(96 - 512)] + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v0.16b, v0.16b, v2.16b + uzp1 v4.16b, v4.16b, v6.16b + uzp1 v16.16b, v0.16b, v4.16b + uzp2 v0.16b, v0.16b, v4.16b + sli v16.16b, v0.16b, #4 + + /* Block 1: coefficients 32-63 */ + ldp q0, q1, [input, #(128 - 512)] + ldp q2, q3, [input, #(160 - 512)] + ldp q4, q5, [input, #(192 - 512)] + ldp q6, q7, [input, #(224 - 512)] + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v0.16b, v0.16b, v2.16b + uzp1 v4.16b, v4.16b, v6.16b + uzp1 v17.16b, v0.16b, v4.16b + uzp2 v0.16b, v0.16b, v4.16b + sli v17.16b, v0.16b, #4 + + /* Block 2: coefficients 64-95 */ + ldp q0, q1, [input, #(256 - 512)] + ldp q2, q3, [input, #(288 - 512)] + ldp q4, q5, [input, #(320 - 512)] + ldp q6, q7, [input, #(352 - 512)] + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v0.16b, v0.16b, v2.16b + uzp1 v4.16b, v4.16b, v6.16b + uzp1 v18.16b, v0.16b, v4.16b + uzp2 v0.16b, v0.16b, v4.16b + sli v18.16b, v0.16b, #4 + + /* Block 3: coefficients 96-127 */ + ldp q0, q1, [input, #(384 - 512)] + ldp q2, q3, [input, #(416 - 512)] + ldp q4, q5, [input, #(448 - 512)] + ldp q6, q7, [input, #(480 - 512)] + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v0.16b, v0.16b, v2.16b + uzp1 v4.16b, v4.16b, v6.16b + uzp1 v19.16b, v0.16b, v4.16b + uzp2 v0.16b, v0.16b, v4.16b + sli v19.16b, v0.16b, #4 + + st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [output], #64 + + subs count, count, #1 + bne polyw1_pack_32_loop + + ret + + .unreq output + .unreq input + .unreq count +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87) */ diff --git a/dev/aarch64_clean/src/polyw1_pack_88_asm.S b/dev/aarch64_clean/src/polyw1_pack_88_asm.S new file mode 100644 index 000000000..875fc177b --- /dev/null +++ b/dev/aarch64_clean/src/polyw1_pack_88_asm.S @@ -0,0 +1,126 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44) +/* simpasm: header-end */ + +/* + * polyw1_pack_88: Pack w1 polynomial for GAMMA2 = (Q-1)/88. + * + * Each coefficient is in [0, 43] (6 bits) stored in a 32-bit word. + * Pack 4 coefficients into 3 bytes: + * r[3i+0] = a[4i+0] | (a[4i+1] << 6) + * r[3i+1] = (a[4i+1] >> 2) | (a[4i+2] << 4) + * r[3i+2] = (a[4i+2] >> 4) | (a[4i+3] << 2) + * 256 coefficients -> 192 output bytes. + * + * Each group of 4 coefficients in a .4s vector is shifted to its + * bit position using USHL, then reduced with ADDP to form one + * 24-bit packed value per 32-bit lane. + * + * Three 2-register TBL instructions then extract the useful 3 bytes + * from each 32-bit lane across pairs of adjacent result vectors, + * producing 3 contiguous 16-byte output vectors (48 bytes total). + * + * 4x unrolled, 4 iterations for 256 coefficients. + */ + + output .req x0 + input .req x1 + table .req x2 + count .req x3 + + v_shifts .req v24 + v_tbl0 .req v25 + v_tbl1 .req v26 + v_tbl2 .req v27 + +.text +.global MLD_ASM_NAMESPACE(polyw1_pack_88_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(polyw1_pack_88_asm) + + /* Load constants from table pointer (x2): + * [0:15] = v_shifts.4s = {0, 6, 12, 18} + * [16:31] = v_tbl0: TBL indices for out0 from {v16, v17} + * [32:47] = v_tbl1: TBL indices for out1 from {v17, v18} + * [48:63] = v_tbl2: TBL indices for out2 from {v18, v19} */ + ldp q24, q25, [table] + ldp q26, q27, [table, #32] + + mov count, #(256 / (16 * 4)) + +polyw1_pack_88_loop: + + /* Block 0: coefficients 0-15 */ + ldp q0, q1, [input], #256 + ldp q2, q3, [input, #(32 - 256)] + ushl v0.4s, v0.4s, v_shifts.4s + ushl v1.4s, v1.4s, v_shifts.4s + ushl v2.4s, v2.4s, v_shifts.4s + ushl v3.4s, v3.4s, v_shifts.4s + addp v0.4s, v0.4s, v1.4s + addp v2.4s, v2.4s, v3.4s + addp v16.4s, v0.4s, v2.4s + + /* Block 1: coefficients 16-31 */ + ldp q0, q1, [input, #(64 - 256)] + ldp q2, q3, [input, #(96 - 256)] + ushl v0.4s, v0.4s, v_shifts.4s + ushl v1.4s, v1.4s, v_shifts.4s + ushl v2.4s, v2.4s, v_shifts.4s + ushl v3.4s, v3.4s, v_shifts.4s + addp v0.4s, v0.4s, v1.4s + addp v2.4s, v2.4s, v3.4s + addp v17.4s, v0.4s, v2.4s + + /* Block 2: coefficients 32-47 */ + ldp q0, q1, [input, #(128 - 256)] + ldp q2, q3, [input, #(160 - 256)] + ushl v0.4s, v0.4s, v_shifts.4s + ushl v1.4s, v1.4s, v_shifts.4s + ushl v2.4s, v2.4s, v_shifts.4s + ushl v3.4s, v3.4s, v_shifts.4s + addp v0.4s, v0.4s, v1.4s + addp v2.4s, v2.4s, v3.4s + addp v18.4s, v0.4s, v2.4s + + /* Block 3: coefficients 48-63 */ + ldp q0, q1, [input, #(192 - 256)] + ldp q2, q3, [input, #(224 - 256)] + ushl v0.4s, v0.4s, v_shifts.4s + ushl v1.4s, v1.4s, v_shifts.4s + ushl v2.4s, v2.4s, v_shifts.4s + ushl v3.4s, v3.4s, v_shifts.4s + addp v0.4s, v0.4s, v1.4s + addp v2.4s, v2.4s, v3.4s + addp v19.4s, v0.4s, v2.4s + + /* Compact + splice into 3 output vectors */ + tbl v20.16b, {v16.16b, v17.16b}, v_tbl0.16b + tbl v21.16b, {v17.16b, v18.16b}, v_tbl1.16b + tbl v22.16b, {v18.16b, v19.16b}, v_tbl2.16b + + st1 {v20.16b, v21.16b, v22.16b}, [output], #48 + + subs count, count, #1 + bne polyw1_pack_88_loop + + ret + + .unreq output + .unreq input + .unreq table + .unreq count + .unreq v_shifts + .unreq v_tbl0 + .unreq v_tbl1 + .unreq v_tbl2 +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \ + */ diff --git a/dev/aarch64_opt/meta.h b/dev/aarch64_opt/meta.h index 2923b8c55..c1c1bd4db 100644 --- a/dev/aarch64_opt/meta.h +++ b/dev/aarch64_opt/meta.h @@ -21,6 +21,8 @@ #define MLD_USE_NATIVE_POLY_CHKNORM #define MLD_USE_NATIVE_POLYZ_UNPACK_17 #define MLD_USE_NATIVE_POLYZ_UNPACK_19 +#define MLD_USE_NATIVE_POLYW1_PACK_32 +#define MLD_USE_NATIVE_POLYW1_PACK_88 #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 @@ -198,6 +200,44 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf) #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ || MLD_CONFIG_PARAMETER_SET == 87 */ +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87) +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a) +{ + mld_polyw1_pack_32_asm(r, a); + return MLD_NATIVE_FUNC_SUCCESS; +} +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87 */ + +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44 +/* Table of constants for polyw1_pack_88_asm: + * [0:15] v_shifts: USHL shift amounts {0, 6, 12, 18} as .4s + * [16:31] v_tbl0: TBL indices for out0 from {v16, v17} + * [32:47] v_tbl1: TBL indices for out1 from {v17, v18} + * [48:63] v_tbl2: TBL indices for out2 from {v18, v19} */ +/* clang-format off */ +MLD_ALIGN static const uint8_t mld_polyw1_pack_88_consts[] = { + /* v_shifts: {0, 6, 12, 18} as uint32_t little-endian */ + 0, 0, 0, 0, 6, 0, 0, 0, 12, 0, 0, 0, 18, 0, 0, 0, + /* v_tbl0: {0,1,2, 4,5,6, 8,9,10, 12,13,14, 16,17,18, 20} */ + 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, + /* v_tbl1: {5,6, 8,9,10, 12,13,14, 16,17,18, 20,21,22, 24,25} */ + 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25, + /* v_tbl2: {10, 12,13,14, 16,17,18, 20,21,22, 24,25,26, 28,29,30} */ + 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25, 26, 28, 29, 30, +}; +/* clang-format on */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a) +{ + mld_polyw1_pack_88_asm(r, a, mld_polyw1_pack_88_consts); + return MLD_NATIVE_FUNC_SUCCESS; +} +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \ + */ + MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_poly_pointwise_montgomery_native( int32_t out[MLDSA_N], const int32_t in0[MLDSA_N], diff --git a/dev/aarch64_opt/src/arith_native_aarch64.h b/dev/aarch64_opt/src/arith_native_aarch64.h index f78a1487d..967884699 100644 --- a/dev/aarch64_opt/src/arith_native_aarch64.h +++ b/dev/aarch64_opt/src/arith_native_aarch64.h @@ -105,6 +105,12 @@ void mld_polyz_unpack_17_asm(int32_t *r, const uint8_t *buf, void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf, const uint8_t *indices); +#define mld_polyw1_pack_32_asm MLD_NAMESPACE(polyw1_pack_32_asm) +void mld_polyw1_pack_32_asm(uint8_t *r, const int32_t *a); + +#define mld_polyw1_pack_88_asm MLD_NAMESPACE(polyw1_pack_88_asm) +void mld_polyw1_pack_88_asm(uint8_t *r, const int32_t *a, const uint8_t *table); + #define mld_poly_pointwise_montgomery_asm \ MLD_NAMESPACE(poly_pointwise_montgomery_asm) void mld_poly_pointwise_montgomery_asm(int32_t *, const int32_t *, diff --git a/dev/aarch64_opt/src/polyw1_pack_32_asm.S b/dev/aarch64_opt/src/polyw1_pack_32_asm.S new file mode 100644 index 000000000..0598b71de --- /dev/null +++ b/dev/aarch64_opt/src/polyw1_pack_32_asm.S @@ -0,0 +1,112 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87) +/* simpasm: header-end */ + +/* + * polyw1_pack_32: Pack w1 polynomial for GAMMA2 = (Q-1)/32. + * + * Each coefficient is in [0, 15] (4 bits) stored in a 32-bit word. + * Pack 2 coefficients per byte: r[i] = a[2i] | (a[2i+1] << 4) + * 256 coefficients -> 128 output bytes. + * + * UZP1 narrowing chain (32->16->8 bit) extracts the low byte from + * each coefficient; UZP1/UZP2 separate even/odd coefficients; + * SLI shifts and inserts the odd nibbles. + * + * 4x unrolled, 2 iterations for 256 coefficients. + */ + + output .req x0 + input .req x1 + count .req x2 + +.text +.global MLD_ASM_NAMESPACE(polyw1_pack_32_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(polyw1_pack_32_asm) + + mov count, #(256 / (32 * 4)) + +polyw1_pack_32_loop: + + /* Block 0: coefficients 0-31 */ + ldp q0, q1, [input], #512 + ldp q2, q3, [input, #(32 - 512)] + ldp q4, q5, [input, #(64 - 512)] + ldp q6, q7, [input, #(96 - 512)] + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v0.16b, v0.16b, v2.16b + uzp1 v4.16b, v4.16b, v6.16b + uzp1 v16.16b, v0.16b, v4.16b + uzp2 v0.16b, v0.16b, v4.16b + sli v16.16b, v0.16b, #4 + + /* Block 1: coefficients 32-63 */ + ldp q0, q1, [input, #(128 - 512)] + ldp q2, q3, [input, #(160 - 512)] + ldp q4, q5, [input, #(192 - 512)] + ldp q6, q7, [input, #(224 - 512)] + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v0.16b, v0.16b, v2.16b + uzp1 v4.16b, v4.16b, v6.16b + uzp1 v17.16b, v0.16b, v4.16b + uzp2 v0.16b, v0.16b, v4.16b + sli v17.16b, v0.16b, #4 + + /* Block 2: coefficients 64-95 */ + ldp q0, q1, [input, #(256 - 512)] + ldp q2, q3, [input, #(288 - 512)] + ldp q4, q5, [input, #(320 - 512)] + ldp q6, q7, [input, #(352 - 512)] + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v0.16b, v0.16b, v2.16b + uzp1 v4.16b, v4.16b, v6.16b + uzp1 v18.16b, v0.16b, v4.16b + uzp2 v0.16b, v0.16b, v4.16b + sli v18.16b, v0.16b, #4 + + /* Block 3: coefficients 96-127 */ + ldp q0, q1, [input, #(384 - 512)] + ldp q2, q3, [input, #(416 - 512)] + ldp q4, q5, [input, #(448 - 512)] + ldp q6, q7, [input, #(480 - 512)] + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v0.16b, v0.16b, v2.16b + uzp1 v4.16b, v4.16b, v6.16b + uzp1 v19.16b, v0.16b, v4.16b + uzp2 v0.16b, v0.16b, v4.16b + sli v19.16b, v0.16b, #4 + + st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [output], #64 + + subs count, count, #1 + bne polyw1_pack_32_loop + + ret + + .unreq output + .unreq input + .unreq count +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87) */ diff --git a/dev/aarch64_opt/src/polyw1_pack_88_asm.S b/dev/aarch64_opt/src/polyw1_pack_88_asm.S new file mode 100644 index 000000000..875fc177b --- /dev/null +++ b/dev/aarch64_opt/src/polyw1_pack_88_asm.S @@ -0,0 +1,126 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44) +/* simpasm: header-end */ + +/* + * polyw1_pack_88: Pack w1 polynomial for GAMMA2 = (Q-1)/88. + * + * Each coefficient is in [0, 43] (6 bits) stored in a 32-bit word. + * Pack 4 coefficients into 3 bytes: + * r[3i+0] = a[4i+0] | (a[4i+1] << 6) + * r[3i+1] = (a[4i+1] >> 2) | (a[4i+2] << 4) + * r[3i+2] = (a[4i+2] >> 4) | (a[4i+3] << 2) + * 256 coefficients -> 192 output bytes. + * + * Each group of 4 coefficients in a .4s vector is shifted to its + * bit position using USHL, then reduced with ADDP to form one + * 24-bit packed value per 32-bit lane. + * + * Three 2-register TBL instructions then extract the useful 3 bytes + * from each 32-bit lane across pairs of adjacent result vectors, + * producing 3 contiguous 16-byte output vectors (48 bytes total). + * + * 4x unrolled, 4 iterations for 256 coefficients. + */ + + output .req x0 + input .req x1 + table .req x2 + count .req x3 + + v_shifts .req v24 + v_tbl0 .req v25 + v_tbl1 .req v26 + v_tbl2 .req v27 + +.text +.global MLD_ASM_NAMESPACE(polyw1_pack_88_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(polyw1_pack_88_asm) + + /* Load constants from table pointer (x2): + * [0:15] = v_shifts.4s = {0, 6, 12, 18} + * [16:31] = v_tbl0: TBL indices for out0 from {v16, v17} + * [32:47] = v_tbl1: TBL indices for out1 from {v17, v18} + * [48:63] = v_tbl2: TBL indices for out2 from {v18, v19} */ + ldp q24, q25, [table] + ldp q26, q27, [table, #32] + + mov count, #(256 / (16 * 4)) + +polyw1_pack_88_loop: + + /* Block 0: coefficients 0-15 */ + ldp q0, q1, [input], #256 + ldp q2, q3, [input, #(32 - 256)] + ushl v0.4s, v0.4s, v_shifts.4s + ushl v1.4s, v1.4s, v_shifts.4s + ushl v2.4s, v2.4s, v_shifts.4s + ushl v3.4s, v3.4s, v_shifts.4s + addp v0.4s, v0.4s, v1.4s + addp v2.4s, v2.4s, v3.4s + addp v16.4s, v0.4s, v2.4s + + /* Block 1: coefficients 16-31 */ + ldp q0, q1, [input, #(64 - 256)] + ldp q2, q3, [input, #(96 - 256)] + ushl v0.4s, v0.4s, v_shifts.4s + ushl v1.4s, v1.4s, v_shifts.4s + ushl v2.4s, v2.4s, v_shifts.4s + ushl v3.4s, v3.4s, v_shifts.4s + addp v0.4s, v0.4s, v1.4s + addp v2.4s, v2.4s, v3.4s + addp v17.4s, v0.4s, v2.4s + + /* Block 2: coefficients 32-47 */ + ldp q0, q1, [input, #(128 - 256)] + ldp q2, q3, [input, #(160 - 256)] + ushl v0.4s, v0.4s, v_shifts.4s + ushl v1.4s, v1.4s, v_shifts.4s + ushl v2.4s, v2.4s, v_shifts.4s + ushl v3.4s, v3.4s, v_shifts.4s + addp v0.4s, v0.4s, v1.4s + addp v2.4s, v2.4s, v3.4s + addp v18.4s, v0.4s, v2.4s + + /* Block 3: coefficients 48-63 */ + ldp q0, q1, [input, #(192 - 256)] + ldp q2, q3, [input, #(224 - 256)] + ushl v0.4s, v0.4s, v_shifts.4s + ushl v1.4s, v1.4s, v_shifts.4s + ushl v2.4s, v2.4s, v_shifts.4s + ushl v3.4s, v3.4s, v_shifts.4s + addp v0.4s, v0.4s, v1.4s + addp v2.4s, v2.4s, v3.4s + addp v19.4s, v0.4s, v2.4s + + /* Compact + splice into 3 output vectors */ + tbl v20.16b, {v16.16b, v17.16b}, v_tbl0.16b + tbl v21.16b, {v17.16b, v18.16b}, v_tbl1.16b + tbl v22.16b, {v18.16b, v19.16b}, v_tbl2.16b + + st1 {v20.16b, v21.16b, v22.16b}, [output], #48 + + subs count, count, #1 + bne polyw1_pack_88_loop + + ret + + .unreq output + .unreq input + .unreq table + .unreq count + .unreq v_shifts + .unreq v_tbl0 + .unreq v_tbl1 + .unreq v_tbl2 +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \ + */ diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h index 9e45b661e..cfdbf72f5 100644 --- a/dev/x86_64/meta.h +++ b/dev/x86_64/meta.h @@ -25,6 +25,8 @@ #define MLD_USE_NATIVE_POLY_CHKNORM #define MLD_USE_NATIVE_POLYZ_UNPACK_17 #define MLD_USE_NATIVE_POLYZ_UNPACK_19 +#define MLD_USE_NATIVE_POLYW1_PACK_32 +#define MLD_USE_NATIVE_POLYW1_PACK_88 #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 @@ -253,6 +255,35 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a) #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ || MLD_CONFIG_PARAMETER_SET == 87 */ +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87) +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a) +{ + if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) + { + return MLD_NATIVE_FUNC_FALLBACK; + } + mld_polyw1_pack_32_avx2(r, a); + return MLD_NATIVE_FUNC_SUCCESS; +} +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87 */ + +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44 +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a) +{ + if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) + { + return MLD_NATIVE_FUNC_FALLBACK; + } + mld_polyw1_pack_88_avx2(r, a); + return MLD_NATIVE_FUNC_SUCCESS; +} +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \ + */ + MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_poly_pointwise_montgomery_native( int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) diff --git a/dev/x86_64/src/arith_native_x86_64.h b/dev/x86_64/src/arith_native_x86_64.h index 956a6e541..9832c202d 100644 --- a/dev/x86_64/src/arith_native_x86_64.h +++ b/dev/x86_64/src/arith_native_x86_64.h @@ -102,6 +102,12 @@ void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a); #define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2) void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a); +#define mld_polyw1_pack_32_avx2 MLD_NAMESPACE(mld_polyw1_pack_32_avx2) +void mld_polyw1_pack_32_avx2(uint8_t *r, const int32_t *a); + +#define mld_polyw1_pack_88_avx2 MLD_NAMESPACE(mld_polyw1_pack_88_avx2) +void mld_polyw1_pack_88_avx2(uint8_t *r, const int32_t *a); + #define mld_pointwise_avx2 MLD_NAMESPACE(pointwise_avx2) void mld_pointwise_avx2(int32_t *c, const int32_t *a, const int32_t *b, const int32_t *qdata); diff --git a/dev/x86_64/src/polyw1_pack_32_avx2.c b/dev/x86_64/src/polyw1_pack_32_avx2.c new file mode 100644 index 000000000..152a7836b --- /dev/null +++ b/dev/x86_64/src/polyw1_pack_32_avx2.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) + +#include +#include "arith_native_x86_64.h" + +/* Pack w1 polynomial (coefficients in [0,15]) for GAMMA2 = (Q-1)/32. + * Packs 2 nibbles per byte; 64 coefficients per iteration. */ +void mld_polyw1_pack_32_avx2(uint8_t *r, const int32_t *a) +{ + unsigned int i; + const __m256i shift = _mm256_set1_epi16((16 << 8) + 1); + const __m256i shufbidx = + _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, 15, + 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0); + + for (i = 0; i < MLDSA_N / 64; ++i) + { + __m256i f0 = _mm256_load_si256((__m256i *)&a[64 * i + 0]); + __m256i f1 = _mm256_load_si256((__m256i *)&a[64 * i + 8]); + __m256i f2 = _mm256_load_si256((__m256i *)&a[64 * i + 16]); + __m256i f3 = _mm256_load_si256((__m256i *)&a[64 * i + 24]); + __m256i f4 = _mm256_load_si256((__m256i *)&a[64 * i + 32]); + __m256i f5 = _mm256_load_si256((__m256i *)&a[64 * i + 40]); + __m256i f6 = _mm256_load_si256((__m256i *)&a[64 * i + 48]); + __m256i f7 = _mm256_load_si256((__m256i *)&a[64 * i + 56]); + f0 = _mm256_packus_epi32(f0, f1); + f1 = _mm256_packus_epi32(f2, f3); + f2 = _mm256_packus_epi32(f4, f5); + f3 = _mm256_packus_epi32(f6, f7); + f0 = _mm256_packus_epi16(f0, f1); + f1 = _mm256_packus_epi16(f2, f3); + f0 = _mm256_maddubs_epi16(f0, shift); + f1 = _mm256_maddubs_epi16(f1, shift); + f0 = _mm256_packus_epi16(f0, f1); + f0 = _mm256_permute4x64_epi64(f0, 0xD8); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + _mm256_storeu_si256((__m256i *)&r[32 * i], f0); + } +} + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ + 65 || MLD_CONFIG_PARAMETER_SET == 87) */ + +MLD_EMPTY_CU(avx2_polyw1_pack_32) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87)) */ diff --git a/dev/x86_64/src/polyw1_pack_88_avx2.c b/dev/x86_64/src/polyw1_pack_88_avx2.c new file mode 100644 index 000000000..3d1fcb19d --- /dev/null +++ b/dev/x86_64/src/polyw1_pack_88_avx2.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + MLD_CONFIG_PARAMETER_SET == 44) + +#include +#include "arith_native_x86_64.h" + +/* Pack w1 polynomial (coefficients in [0,43]) for GAMMA2 = (Q-1)/88. + * 6-bit encoding, 4 coefficients per 3 bytes; 32 coefficients per iteration. */ +void mld_polyw1_pack_88_avx2(uint8_t *r, const int32_t *a) +{ + unsigned int i; + const __m256i shift1 = _mm256_set1_epi16((64 << 8) + 1); + const __m256i shift2 = _mm256_set1_epi32(((1 << 12) << 16) + 1); + const __m256i shufdidx1 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + const __m256i shufdidx2 = _mm256_set_epi32(-1, -1, 6, 5, 4, 2, 1, 0); + const __m256i shufbidx = + _mm256_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0, + -1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0); + + for (i = 0; i < MLDSA_N / 32; i++) + { + __m256i f0 = _mm256_load_si256((__m256i *)&a[32 * i + 0]); + __m256i f1 = _mm256_load_si256((__m256i *)&a[32 * i + 8]); + __m256i f2 = _mm256_load_si256((__m256i *)&a[32 * i + 16]); + __m256i f3 = _mm256_load_si256((__m256i *)&a[32 * i + 24]); + f0 = _mm256_packus_epi32(f0, f1); + f1 = _mm256_packus_epi32(f2, f3); + f0 = _mm256_packus_epi16(f0, f1); + f0 = _mm256_maddubs_epi16(f0, shift1); + f0 = _mm256_madd_epi16(f0, shift2); + f0 = _mm256_permutevar8x32_epi32(f0, shufdidx1); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + f0 = _mm256_permutevar8x32_epi32(f0, shufdidx2); + + /* Each iteration produces 24 valid bytes in the low 192 bits. + * Store as 128-bit + 64-bit to avoid writing past the output buffer. */ + { + __m128i lo = _mm256_castsi256_si128(f0); + __m128i hi = _mm256_extracti128_si256(f0, 1); + _mm_storeu_si128((__m128i *)&r[24 * i], lo); + _mm_storel_epi64((__m128i *)&r[24 * i + 16], hi); + } + } +} + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ + 44) */ + +MLD_EMPTY_CU(avx2_polyw1_pack_88) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ + 44)) */ diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c index 4e02f0d7b..985b48aa4 100644 --- a/mldsa/mldsa_native.c +++ b/mldsa/mldsa_native.c @@ -88,6 +88,8 @@ #include "src/native/x86_64/src/poly_decompose_88_avx2.c" #include "src/native/x86_64/src/poly_use_hint_32_avx2.c" #include "src/native/x86_64/src/poly_use_hint_88_avx2.c" +#include "src/native/x86_64/src/polyw1_pack_32_avx2.c" +#include "src/native/x86_64/src/polyw1_pack_88_avx2.c" #include "src/native/x86_64/src/polyz_unpack_17_avx2.c" #include "src/native/x86_64/src/polyz_unpack_19_avx2.c" #include "src/native/x86_64/src/rej_uniform_avx2.c" @@ -617,6 +619,8 @@ #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 +#undef MLD_USE_NATIVE_POLYW1_PACK_32 +#undef MLD_USE_NATIVE_POLYW1_PACK_88 #undef MLD_USE_NATIVE_POLYZ_UNPACK_17 #undef MLD_USE_NATIVE_POLYZ_UNPACK_19 #undef MLD_USE_NATIVE_POLY_CADDQ @@ -648,6 +652,8 @@ #undef mld_polyvecl_pointwise_acc_montgomery_l4_asm #undef mld_polyvecl_pointwise_acc_montgomery_l5_asm #undef mld_polyvecl_pointwise_acc_montgomery_l7_asm +#undef mld_polyw1_pack_32_asm +#undef mld_polyw1_pack_88_asm #undef mld_polyz_unpack_17_asm #undef mld_polyz_unpack_17_indices #undef mld_polyz_unpack_19_asm @@ -672,6 +678,8 @@ #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 +#undef MLD_USE_NATIVE_POLYW1_PACK_32 +#undef MLD_USE_NATIVE_POLYW1_PACK_88 #undef MLD_USE_NATIVE_POLYZ_UNPACK_17 #undef MLD_USE_NATIVE_POLYZ_UNPACK_19 #undef MLD_USE_NATIVE_POLY_CADDQ @@ -701,6 +709,8 @@ #undef mld_poly_decompose_88_avx2 #undef mld_poly_use_hint_32_avx2 #undef mld_poly_use_hint_88_avx2 +#undef mld_polyw1_pack_32_avx2 +#undef mld_polyw1_pack_88_avx2 #undef mld_polyz_unpack_17_avx2 #undef mld_polyz_unpack_19_avx2 #undef mld_rej_uniform_avx2 diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S index cee9460ab..dc83a55b0 100644 --- a/mldsa/mldsa_native_asm.S +++ b/mldsa/mldsa_native_asm.S @@ -72,6 +72,8 @@ #include "src/native/aarch64/src/poly_decompose_88_asm.S" #include "src/native/aarch64/src/poly_use_hint_32_asm.S" #include "src/native/aarch64/src/poly_use_hint_88_asm.S" +#include "src/native/aarch64/src/polyw1_pack_32_asm.S" +#include "src/native/aarch64/src/polyw1_pack_88_asm.S" #include "src/native/aarch64/src/polyz_unpack_17_asm.S" #include "src/native/aarch64/src/polyz_unpack_19_asm.S" #include "src/native/aarch64/src/rej_uniform_asm.S" @@ -620,6 +622,8 @@ #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 +#undef MLD_USE_NATIVE_POLYW1_PACK_32 +#undef MLD_USE_NATIVE_POLYW1_PACK_88 #undef MLD_USE_NATIVE_POLYZ_UNPACK_17 #undef MLD_USE_NATIVE_POLYZ_UNPACK_19 #undef MLD_USE_NATIVE_POLY_CADDQ @@ -651,6 +655,8 @@ #undef mld_polyvecl_pointwise_acc_montgomery_l4_asm #undef mld_polyvecl_pointwise_acc_montgomery_l5_asm #undef mld_polyvecl_pointwise_acc_montgomery_l7_asm +#undef mld_polyw1_pack_32_asm +#undef mld_polyw1_pack_88_asm #undef mld_polyz_unpack_17_asm #undef mld_polyz_unpack_17_indices #undef mld_polyz_unpack_19_asm @@ -675,6 +681,8 @@ #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 +#undef MLD_USE_NATIVE_POLYW1_PACK_32 +#undef MLD_USE_NATIVE_POLYW1_PACK_88 #undef MLD_USE_NATIVE_POLYZ_UNPACK_17 #undef MLD_USE_NATIVE_POLYZ_UNPACK_19 #undef MLD_USE_NATIVE_POLY_CADDQ @@ -704,6 +712,8 @@ #undef mld_poly_decompose_88_avx2 #undef mld_poly_use_hint_32_avx2 #undef mld_poly_use_hint_88_avx2 +#undef mld_polyw1_pack_32_avx2 +#undef mld_polyw1_pack_88_avx2 #undef mld_polyz_unpack_17_avx2 #undef mld_polyz_unpack_19_avx2 #undef mld_rej_uniform_avx2 diff --git a/mldsa/src/native/aarch64/meta.h b/mldsa/src/native/aarch64/meta.h index 2923b8c55..c1c1bd4db 100644 --- a/mldsa/src/native/aarch64/meta.h +++ b/mldsa/src/native/aarch64/meta.h @@ -21,6 +21,8 @@ #define MLD_USE_NATIVE_POLY_CHKNORM #define MLD_USE_NATIVE_POLYZ_UNPACK_17 #define MLD_USE_NATIVE_POLYZ_UNPACK_19 +#define MLD_USE_NATIVE_POLYW1_PACK_32 +#define MLD_USE_NATIVE_POLYW1_PACK_88 #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 @@ -198,6 +200,44 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf) #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ || MLD_CONFIG_PARAMETER_SET == 87 */ +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87) +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a) +{ + mld_polyw1_pack_32_asm(r, a); + return MLD_NATIVE_FUNC_SUCCESS; +} +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87 */ + +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44 +/* Table of constants for polyw1_pack_88_asm: + * [0:15] v_shifts: USHL shift amounts {0, 6, 12, 18} as .4s + * [16:31] v_tbl0: TBL indices for out0 from {v16, v17} + * [32:47] v_tbl1: TBL indices for out1 from {v17, v18} + * [48:63] v_tbl2: TBL indices for out2 from {v18, v19} */ +/* clang-format off */ +MLD_ALIGN static const uint8_t mld_polyw1_pack_88_consts[] = { + /* v_shifts: {0, 6, 12, 18} as uint32_t little-endian */ + 0, 0, 0, 0, 6, 0, 0, 0, 12, 0, 0, 0, 18, 0, 0, 0, + /* v_tbl0: {0,1,2, 4,5,6, 8,9,10, 12,13,14, 16,17,18, 20} */ + 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, + /* v_tbl1: {5,6, 8,9,10, 12,13,14, 16,17,18, 20,21,22, 24,25} */ + 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25, + /* v_tbl2: {10, 12,13,14, 16,17,18, 20,21,22, 24,25,26, 28,29,30} */ + 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25, 26, 28, 29, 30, +}; +/* clang-format on */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a) +{ + mld_polyw1_pack_88_asm(r, a, mld_polyw1_pack_88_consts); + return MLD_NATIVE_FUNC_SUCCESS; +} +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \ + */ + MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_poly_pointwise_montgomery_native( int32_t out[MLDSA_N], const int32_t in0[MLDSA_N], diff --git a/mldsa/src/native/aarch64/src/arith_native_aarch64.h b/mldsa/src/native/aarch64/src/arith_native_aarch64.h index f78a1487d..967884699 100644 --- a/mldsa/src/native/aarch64/src/arith_native_aarch64.h +++ b/mldsa/src/native/aarch64/src/arith_native_aarch64.h @@ -105,6 +105,12 @@ void mld_polyz_unpack_17_asm(int32_t *r, const uint8_t *buf, void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf, const uint8_t *indices); +#define mld_polyw1_pack_32_asm MLD_NAMESPACE(polyw1_pack_32_asm) +void mld_polyw1_pack_32_asm(uint8_t *r, const int32_t *a); + +#define mld_polyw1_pack_88_asm MLD_NAMESPACE(polyw1_pack_88_asm) +void mld_polyw1_pack_88_asm(uint8_t *r, const int32_t *a, const uint8_t *table); + #define mld_poly_pointwise_montgomery_asm \ MLD_NAMESPACE(poly_pointwise_montgomery_asm) void mld_poly_pointwise_montgomery_asm(int32_t *, const int32_t *, diff --git a/mldsa/src/native/aarch64/src/polyw1_pack_32_asm.S b/mldsa/src/native/aarch64/src/polyw1_pack_32_asm.S new file mode 100644 index 000000000..632af257f --- /dev/null +++ b/mldsa/src/native/aarch64/src/polyw1_pack_32_asm.S @@ -0,0 +1,91 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_opt/src/polyw1_pack_32_asm.S using scripts/simpasm. Do not modify it directly. + */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",@progbits +#endif + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(polyw1_pack_32_asm) +MLD_ASM_FN_SYMBOL(polyw1_pack_32_asm) + + .cfi_startproc + mov x2, #0x2 // =2 + +Lpolyw1_pack_32_loop: + ldp q0, q1, [x1], #0x200 + ldp q2, q3, [x1, #-0x1e0] + ldp q4, q5, [x1, #-0x1c0] + ldp q6, q7, [x1, #-0x1a0] + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v0.16b, v0.16b, v2.16b + uzp1 v4.16b, v4.16b, v6.16b + uzp1 v16.16b, v0.16b, v4.16b + uzp2 v0.16b, v0.16b, v4.16b + sli v16.16b, v0.16b, #0x4 + ldp q0, q1, [x1, #-0x180] + ldp q2, q3, [x1, #-0x160] + ldp q4, q5, [x1, #-0x140] + ldp q6, q7, [x1, #-0x120] + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v0.16b, v0.16b, v2.16b + uzp1 v4.16b, v4.16b, v6.16b + uzp1 v17.16b, v0.16b, v4.16b + uzp2 v0.16b, v0.16b, v4.16b + sli v17.16b, v0.16b, #0x4 + ldp q0, q1, [x1, #-0x100] + ldp q2, q3, [x1, #-0xe0] + ldp q4, q5, [x1, #-0xc0] + ldp q6, q7, [x1, #-0xa0] + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v0.16b, v0.16b, v2.16b + uzp1 v4.16b, v4.16b, v6.16b + uzp1 v18.16b, v0.16b, v4.16b + uzp2 v0.16b, v0.16b, v4.16b + sli v18.16b, v0.16b, #0x4 + ldp q0, q1, [x1, #-0x80] + ldp q2, q3, [x1, #-0x60] + ldp q4, q5, [x1, #-0x40] + ldp q6, q7, [x1, #-0x20] + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v0.16b, v0.16b, v2.16b + uzp1 v4.16b, v4.16b, v6.16b + uzp1 v19.16b, v0.16b, v4.16b + uzp2 v0.16b, v0.16b, v4.16b + sli v19.16b, v0.16b, #0x4 + st1 { v16.16b, v17.16b, v18.16b, v19.16b }, [x0], #64 + subs x2, x2, #0x1 + b.ne Lpolyw1_pack_32_loop + ret + .cfi_endproc + +MLD_ASM_FN_SIZE(polyw1_pack_32_asm) + +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87) */ diff --git a/mldsa/src/native/aarch64/src/polyw1_pack_88_asm.S b/mldsa/src/native/aarch64/src/polyw1_pack_88_asm.S new file mode 100644 index 000000000..8e3b2b0dc --- /dev/null +++ b/mldsa/src/native/aarch64/src/polyw1_pack_88_asm.S @@ -0,0 +1,79 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/aarch64_opt/src/polyw1_pack_88_asm.S using scripts/simpasm. Do not modify it directly. + */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",@progbits +#endif + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(polyw1_pack_88_asm) +MLD_ASM_FN_SYMBOL(polyw1_pack_88_asm) + + .cfi_startproc + ldp q24, q25, [x2] + ldp q26, q27, [x2, #0x20] + mov x3, #0x4 // =4 + +Lpolyw1_pack_88_loop: + ldp q0, q1, [x1], #0x100 + ldp q2, q3, [x1, #-0xe0] + ushl v0.4s, v0.4s, v24.4s + ushl v1.4s, v1.4s, v24.4s + ushl v2.4s, v2.4s, v24.4s + ushl v3.4s, v3.4s, v24.4s + addp v0.4s, v0.4s, v1.4s + addp v2.4s, v2.4s, v3.4s + addp v16.4s, v0.4s, v2.4s + ldp q0, q1, [x1, #-0xc0] + ldp q2, q3, [x1, #-0xa0] + ushl v0.4s, v0.4s, v24.4s + ushl v1.4s, v1.4s, v24.4s + ushl v2.4s, v2.4s, v24.4s + ushl v3.4s, v3.4s, v24.4s + addp v0.4s, v0.4s, v1.4s + addp v2.4s, v2.4s, v3.4s + addp v17.4s, v0.4s, v2.4s + ldp q0, q1, [x1, #-0x80] + ldp q2, q3, [x1, #-0x60] + ushl v0.4s, v0.4s, v24.4s + ushl v1.4s, v1.4s, v24.4s + ushl v2.4s, v2.4s, v24.4s + ushl v3.4s, v3.4s, v24.4s + addp v0.4s, v0.4s, v1.4s + addp v2.4s, v2.4s, v3.4s + addp v18.4s, v0.4s, v2.4s + ldp q0, q1, [x1, #-0x40] + ldp q2, q3, [x1, #-0x20] + ushl v0.4s, v0.4s, v24.4s + ushl v1.4s, v1.4s, v24.4s + ushl v2.4s, v2.4s, v24.4s + ushl v3.4s, v3.4s, v24.4s + addp v0.4s, v0.4s, v1.4s + addp v2.4s, v2.4s, v3.4s + addp v19.4s, v0.4s, v2.4s + tbl v20.16b, { v16.16b, v17.16b }, v25.16b + tbl v21.16b, { v17.16b, v18.16b }, v26.16b + tbl v22.16b, { v18.16b, v19.16b }, v27.16b + st1 { v20.16b, v21.16b, v22.16b }, [x0], #48 + subs x3, x3, #0x1 + b.ne Lpolyw1_pack_88_loop + ret + .cfi_endproc + +MLD_ASM_FN_SIZE(polyw1_pack_88_asm) + +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \ + */ diff --git a/mldsa/src/native/api.h b/mldsa/src/native/api.h index 409337fcc..925ac7f4e 100644 --- a/mldsa/src/native/api.h +++ b/mldsa/src/native/api.h @@ -498,6 +498,57 @@ __contract__( || MLD_CONFIG_PARAMETER_SET == 87 */ #endif /* MLD_USE_NATIVE_POLYZ_UNPACK_19 */ +#if defined(MLD_USE_NATIVE_POLYW1_PACK_32) +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87) +/************************************************* + * Name: mld_polyw1_pack_32_native + * + * Description: Native implementation of polyw1_pack for GAMMA2 = (Q-1)/32. + * Bit-pack polynomial w1 with coefficients in [0, 15], + * packing 2 nibbles per byte. + * + * Arguments: - uint8_t *r: pointer to output byte array + * - const int32_t *a: pointer to input polynomial coefficients + **************************************************/ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a) +__contract__( + requires(memory_no_alias(r, MLDSA_POLYW1_PACKEDBYTES)) + requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) + requires(array_bound(a, 0, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2))) + assigns(memory_slice(r, MLDSA_POLYW1_PACKEDBYTES)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) +); +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87 */ +#endif /* MLD_USE_NATIVE_POLYW1_PACK_32 */ + +#if defined(MLD_USE_NATIVE_POLYW1_PACK_88) +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44 +/************************************************* + * Name: mld_polyw1_pack_88_native + * + * Description: Native implementation of polyw1_pack for GAMMA2 = (Q-1)/88. + * Bit-pack polynomial w1 with coefficients in [0, 43], + * using 6-bit encoding (4 coefficients -> 3 bytes). + * + * Arguments: - uint8_t *r: pointer to output byte array + * - const int32_t *a: pointer to input polynomial coefficients + **************************************************/ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a) +__contract__( + requires(memory_no_alias(r, MLDSA_POLYW1_PACKEDBYTES)) + requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) + requires(array_bound(a, 0, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2))) + assigns(memory_slice(r, MLDSA_POLYW1_PACKEDBYTES)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) +); +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \ + */ +#endif /* MLD_USE_NATIVE_POLYW1_PACK_88 */ + #if defined(MLD_USE_NATIVE_POINTWISE_MONTGOMERY) /************************************************* * Name: mld_poly_pointwise_montgomery_native diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h index 9e45b661e..cfdbf72f5 100644 --- a/mldsa/src/native/x86_64/meta.h +++ b/mldsa/src/native/x86_64/meta.h @@ -25,6 +25,8 @@ #define MLD_USE_NATIVE_POLY_CHKNORM #define MLD_USE_NATIVE_POLYZ_UNPACK_17 #define MLD_USE_NATIVE_POLYZ_UNPACK_19 +#define MLD_USE_NATIVE_POLYW1_PACK_32 +#define MLD_USE_NATIVE_POLYW1_PACK_88 #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 @@ -253,6 +255,35 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a) #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ || MLD_CONFIG_PARAMETER_SET == 87 */ +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87) +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a) +{ + if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) + { + return MLD_NATIVE_FUNC_FALLBACK; + } + mld_polyw1_pack_32_avx2(r, a); + return MLD_NATIVE_FUNC_SUCCESS; +} +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87 */ + +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44 +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a) +{ + if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) + { + return MLD_NATIVE_FUNC_FALLBACK; + } + mld_polyw1_pack_88_avx2(r, a); + return MLD_NATIVE_FUNC_SUCCESS; +} +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \ + */ + MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_poly_pointwise_montgomery_native( int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) diff --git a/mldsa/src/native/x86_64/src/arith_native_x86_64.h b/mldsa/src/native/x86_64/src/arith_native_x86_64.h index 956a6e541..9832c202d 100644 --- a/mldsa/src/native/x86_64/src/arith_native_x86_64.h +++ b/mldsa/src/native/x86_64/src/arith_native_x86_64.h @@ -102,6 +102,12 @@ void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a); #define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2) void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a); +#define mld_polyw1_pack_32_avx2 MLD_NAMESPACE(mld_polyw1_pack_32_avx2) +void mld_polyw1_pack_32_avx2(uint8_t *r, const int32_t *a); + +#define mld_polyw1_pack_88_avx2 MLD_NAMESPACE(mld_polyw1_pack_88_avx2) +void mld_polyw1_pack_88_avx2(uint8_t *r, const int32_t *a); + #define mld_pointwise_avx2 MLD_NAMESPACE(pointwise_avx2) void mld_pointwise_avx2(int32_t *c, const int32_t *a, const int32_t *b, const int32_t *qdata); diff --git a/mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c b/mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c new file mode 100644 index 000000000..152a7836b --- /dev/null +++ b/mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) + +#include +#include "arith_native_x86_64.h" + +/* Pack w1 polynomial (coefficients in [0,15]) for GAMMA2 = (Q-1)/32. + * Packs 2 nibbles per byte; 64 coefficients per iteration. */ +void mld_polyw1_pack_32_avx2(uint8_t *r, const int32_t *a) +{ + unsigned int i; + const __m256i shift = _mm256_set1_epi16((16 << 8) + 1); + const __m256i shufbidx = + _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, 15, + 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0); + + for (i = 0; i < MLDSA_N / 64; ++i) + { + __m256i f0 = _mm256_load_si256((__m256i *)&a[64 * i + 0]); + __m256i f1 = _mm256_load_si256((__m256i *)&a[64 * i + 8]); + __m256i f2 = _mm256_load_si256((__m256i *)&a[64 * i + 16]); + __m256i f3 = _mm256_load_si256((__m256i *)&a[64 * i + 24]); + __m256i f4 = _mm256_load_si256((__m256i *)&a[64 * i + 32]); + __m256i f5 = _mm256_load_si256((__m256i *)&a[64 * i + 40]); + __m256i f6 = _mm256_load_si256((__m256i *)&a[64 * i + 48]); + __m256i f7 = _mm256_load_si256((__m256i *)&a[64 * i + 56]); + f0 = _mm256_packus_epi32(f0, f1); + f1 = _mm256_packus_epi32(f2, f3); + f2 = _mm256_packus_epi32(f4, f5); + f3 = _mm256_packus_epi32(f6, f7); + f0 = _mm256_packus_epi16(f0, f1); + f1 = _mm256_packus_epi16(f2, f3); + f0 = _mm256_maddubs_epi16(f0, shift); + f1 = _mm256_maddubs_epi16(f1, shift); + f0 = _mm256_packus_epi16(f0, f1); + f0 = _mm256_permute4x64_epi64(f0, 0xD8); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + _mm256_storeu_si256((__m256i *)&r[32 * i], f0); + } +} + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ + 65 || MLD_CONFIG_PARAMETER_SET == 87) */ + +MLD_EMPTY_CU(avx2_polyw1_pack_32) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87)) */ diff --git a/mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c b/mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c new file mode 100644 index 000000000..3d1fcb19d --- /dev/null +++ b/mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + MLD_CONFIG_PARAMETER_SET == 44) + +#include +#include "arith_native_x86_64.h" + +/* Pack w1 polynomial (coefficients in [0,43]) for GAMMA2 = (Q-1)/88. + * 6-bit encoding, 4 coefficients per 3 bytes; 32 coefficients per iteration. */ +void mld_polyw1_pack_88_avx2(uint8_t *r, const int32_t *a) +{ + unsigned int i; + const __m256i shift1 = _mm256_set1_epi16((64 << 8) + 1); + const __m256i shift2 = _mm256_set1_epi32(((1 << 12) << 16) + 1); + const __m256i shufdidx1 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + const __m256i shufdidx2 = _mm256_set_epi32(-1, -1, 6, 5, 4, 2, 1, 0); + const __m256i shufbidx = + _mm256_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0, + -1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0); + + for (i = 0; i < MLDSA_N / 32; i++) + { + __m256i f0 = _mm256_load_si256((__m256i *)&a[32 * i + 0]); + __m256i f1 = _mm256_load_si256((__m256i *)&a[32 * i + 8]); + __m256i f2 = _mm256_load_si256((__m256i *)&a[32 * i + 16]); + __m256i f3 = _mm256_load_si256((__m256i *)&a[32 * i + 24]); + f0 = _mm256_packus_epi32(f0, f1); + f1 = _mm256_packus_epi32(f2, f3); + f0 = _mm256_packus_epi16(f0, f1); + f0 = _mm256_maddubs_epi16(f0, shift1); + f0 = _mm256_madd_epi16(f0, shift2); + f0 = _mm256_permutevar8x32_epi32(f0, shufdidx1); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + f0 = _mm256_permutevar8x32_epi32(f0, shufdidx2); + + /* Each iteration produces 24 valid bytes in the low 192 bits. + * Store as 128-bit + 64-bit to avoid writing past the output buffer. */ + { + __m128i lo = _mm256_castsi256_si128(f0); + __m128i hi = _mm256_extracti128_si256(f0, 1); + _mm_storeu_si128((__m128i *)&r[24 * i], lo); + _mm_storel_epi64((__m128i *)&r[24 * i + 16], hi); + } + } +} + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ + 44) */ + +MLD_EMPTY_CU(avx2_polyw1_pack_88) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ + 44)) */ diff --git a/mldsa/src/poly_kl.c b/mldsa/src/poly_kl.c index da7c826a4..1dfeb52a3 100644 --- a/mldsa/src/poly_kl.c +++ b/mldsa/src/poly_kl.c @@ -888,31 +888,53 @@ void mld_polyz_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYZ_PACKEDBYTES]) MLD_INTERNAL_API void mld_polyw1_pack(uint8_t r[MLDSA_POLYW1_PACKEDBYTES], const mld_poly *a) { - unsigned int i; - +#if defined(MLD_USE_NATIVE_POLYW1_PACK_88) && MLD_CONFIG_PARAMETER_SET == 44 + int ret; + mld_assert_bound(a->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)); + ret = mld_polyw1_pack_88_native(r, a->coeffs); + if (ret == MLD_NATIVE_FUNC_SUCCESS) + { + return; + } +#elif defined(MLD_USE_NATIVE_POLYW1_PACK_32) && \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87) + int ret; mld_assert_bound(a->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)); + ret = mld_polyw1_pack_32_native(r, a->coeffs); + if (ret == MLD_NATIVE_FUNC_SUCCESS) + { + return; + } +#endif /* !(MLD_USE_NATIVE_POLYW1_PACK_88 && MLD_CONFIG_PARAMETER_SET == 44) \ + && MLD_USE_NATIVE_POLYW1_PACK_32 && (MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87) */ + { + unsigned int i; + + mld_assert_bound(a->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)); #if MLD_CONFIG_PARAMETER_SET == 44 - for (i = 0; i < MLDSA_N / 4; ++i) - __loop__( + for (i = 0; i < MLDSA_N / 4; ++i) + __loop__( invariant(i <= MLDSA_N/4)) - { - r[3 * i + 0] = (uint8_t)((a->coeffs[4 * i + 0]) & 0xFF); - r[3 * i + 0] |= (uint8_t)((a->coeffs[4 * i + 1] << 6) & 0xFF); - r[3 * i + 1] = (uint8_t)((a->coeffs[4 * i + 1] >> 2) & 0xFF); - r[3 * i + 1] |= (uint8_t)((a->coeffs[4 * i + 2] << 4) & 0xFF); - r[3 * i + 2] = (uint8_t)((a->coeffs[4 * i + 2] >> 4) & 0xFF); - r[3 * i + 2] |= (uint8_t)((a->coeffs[4 * i + 3] << 2) & 0xFF); - } + { + r[3 * i + 0] = (uint8_t)((a->coeffs[4 * i + 0]) & 0xFF); + r[3 * i + 0] |= (uint8_t)((a->coeffs[4 * i + 1] << 6) & 0xFF); + r[3 * i + 1] = (uint8_t)((a->coeffs[4 * i + 1] >> 2) & 0xFF); + r[3 * i + 1] |= (uint8_t)((a->coeffs[4 * i + 2] << 4) & 0xFF); + r[3 * i + 2] = (uint8_t)((a->coeffs[4 * i + 2] >> 4) & 0xFF); + r[3 * i + 2] |= (uint8_t)((a->coeffs[4 * i + 3] << 2) & 0xFF); + } #else /* MLD_CONFIG_PARAMETER_SET == 44 */ - for (i = 0; i < MLDSA_N / 2; ++i) - __loop__( + for (i = 0; i < MLDSA_N / 2; ++i) + __loop__( invariant(i <= MLDSA_N/2)) - { - r[i] = - (uint8_t)((a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4)) & 0xFF); - } + { + r[i] = (uint8_t)((a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4)) & + 0xFF); + } #endif /* MLD_CONFIG_PARAMETER_SET != 44 */ + } } /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. */ diff --git a/test/bench/bench_components_mldsa.c b/test/bench/bench_components_mldsa.c index b6261f156..308555cff 100644 --- a/test/bench/bench_components_mldsa.c +++ b/test/bench/bench_components_mldsa.c @@ -9,6 +9,7 @@ #include #include #include "../../mldsa/src/poly.h" +#include "../../mldsa/src/poly_kl.h" #include "../../mldsa/src/polyvec.h" #include "../../mldsa/src/randombytes.h" #include "hal.h" @@ -80,6 +81,18 @@ static int bench(void) mld_polyvec_matrix_pointwise_montgomery(&polyveck_out, &polymat, &polyvecl_b)) + /* polyw1_pack: set up valid input with coefficients in [0, (Q-1)/(2*GAMMA2)) + */ + { + MLD_ALIGN uint8_t w1_packed[MLDSA_POLYW1_PACKEDBYTES]; + MLD_ALIGN mld_poly w1_poly; + for (i = 0; i < MLDSA_N; i++) + { + w1_poly.coeffs[i] = (int32_t)(i % ((MLDSA_Q - 1) / (2 * MLDSA_GAMMA2))); + } + BENCH("polyw1_pack", mld_polyw1_pack(w1_packed, &w1_poly)) + } + return 0; }