diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md
index 9783c2b7a..ab09ea418 100644
--- a/BIBLIOGRAPHY.md
+++ b/BIBLIOGRAPHY.md
@@ -225,6 +225,8 @@ source code and documentation.
   - [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c)
   - [dev/x86_64/src/poly_use_hint_32_avx2.c](dev/x86_64/src/poly_use_hint_32_avx2.c)
   - [dev/x86_64/src/poly_use_hint_88_avx2.c](dev/x86_64/src/poly_use_hint_88_avx2.c)
+  - [dev/x86_64/src/polyw1_pack_32_avx2.c](dev/x86_64/src/polyw1_pack_32_avx2.c)
+  - [dev/x86_64/src/polyw1_pack_88_avx2.c](dev/x86_64/src/polyw1_pack_88_avx2.c)
   - [dev/x86_64/src/polyz_unpack_17_avx2.c](dev/x86_64/src/polyz_unpack_17_avx2.c)
   - [dev/x86_64/src/polyz_unpack_19_avx2.c](dev/x86_64/src/polyz_unpack_19_avx2.c)
   - [dev/x86_64/src/rej_uniform_avx2.c](dev/x86_64/src/rej_uniform_avx2.c)
@@ -243,6 +245,8 @@ source code and documentation.
   - [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c)
   - [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c)
   - [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c)
+  - [mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c](mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c)
+  - [mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c](mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c)
   - [mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c)
   - [mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c)
   - [mldsa/src/native/x86_64/src/rej_uniform_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_avx2.c)
diff --git a/dev/aarch64_clean/meta.h b/dev/aarch64_clean/meta.h
index 2923b8c55..c1c1bd4db 100644
--- a/dev/aarch64_clean/meta.h
+++ b/dev/aarch64_clean/meta.h
@@ -21,6 +21,8 @@
 #define MLD_USE_NATIVE_POLY_CHKNORM
 #define MLD_USE_NATIVE_POLYZ_UNPACK_17
 #define MLD_USE_NATIVE_POLYZ_UNPACK_19
+#define MLD_USE_NATIVE_POLYW1_PACK_32
+#define MLD_USE_NATIVE_POLYW1_PACK_88
 #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
@@ -198,6 +200,44 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf)
 #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
           || MLD_CONFIG_PARAMETER_SET == 87 */
 
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
+{
+  mld_polyw1_pack_32_asm(r, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87 */
+
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
+/* Table of constants for polyw1_pack_88_asm:
+ *   [0:15]  v_shifts: USHL shift amounts {0, 6, 12, 18} as .4s
+ *   [16:31] v_tbl0: TBL indices for out0 from {v16, v17}
+ *   [32:47] v_tbl1: TBL indices for out1 from {v17, v18}
+ *   [48:63] v_tbl2: TBL indices for out2 from {v18, v19} */
+/* clang-format off */
+MLD_ALIGN static const uint8_t mld_polyw1_pack_88_consts[] = {
+  /* v_shifts: {0, 6, 12, 18} as uint32_t little-endian */
+  0, 0, 0, 0,  6, 0, 0, 0,  12, 0, 0, 0,  18, 0, 0, 0,
+  /* v_tbl0: {0,1,2, 4,5,6, 8,9,10, 12,13,14, 16,17,18, 20} */
+  0, 1, 2, 4,  5, 6, 8, 9,  10, 12, 13, 14,  16, 17, 18, 20,
+  /* v_tbl1: {5,6, 8,9,10, 12,13,14, 16,17,18, 20,21,22, 24,25} */
+  5, 6, 8, 9,  10, 12, 13, 14,  16, 17, 18, 20,  21, 22, 24, 25,
+  /* v_tbl2: {10, 12,13,14, 16,17,18, 20,21,22, 24,25,26, 28,29,30} */
+  10, 12, 13, 14,  16, 17, 18, 20,  21, 22, 24, 25,  26, 28, 29, 30,
+};
+/* clang-format on */
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
+{
+  mld_polyw1_pack_88_asm(r, a, mld_polyw1_pack_88_consts);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
+        */
+
 MLD_MUST_CHECK_RETURN_VALUE
 static MLD_INLINE int mld_poly_pointwise_montgomery_native(
     int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],
diff --git a/dev/aarch64_clean/src/arith_native_aarch64.h b/dev/aarch64_clean/src/arith_native_aarch64.h
index f78a1487d..967884699 100644
--- a/dev/aarch64_clean/src/arith_native_aarch64.h
+++ b/dev/aarch64_clean/src/arith_native_aarch64.h
@@ -105,6 +105,12 @@ void mld_polyz_unpack_17_asm(int32_t *r, const uint8_t *buf,
 void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf,
                              const uint8_t *indices);
 
+#define mld_polyw1_pack_32_asm MLD_NAMESPACE(polyw1_pack_32_asm)
+void mld_polyw1_pack_32_asm(uint8_t *r, const int32_t *a);
+
+#define mld_polyw1_pack_88_asm MLD_NAMESPACE(polyw1_pack_88_asm)
+void mld_polyw1_pack_88_asm(uint8_t *r, const int32_t *a, const uint8_t *table);
+
 #define mld_poly_pointwise_montgomery_asm \
   MLD_NAMESPACE(poly_pointwise_montgomery_asm)
 void mld_poly_pointwise_montgomery_asm(int32_t *, const int32_t *,
diff --git a/dev/aarch64_clean/src/polyw1_pack_32_asm.S b/dev/aarch64_clean/src/polyw1_pack_32_asm.S
new file mode 100644
index 000000000..0598b71de
--- /dev/null
+++ b/dev/aarch64_clean/src/polyw1_pack_32_asm.S
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+           (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+            MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+/* simpasm: header-end */
+
+/*
+ * polyw1_pack_32: Pack w1 polynomial for GAMMA2 = (Q-1)/32.
+ *
+ * Each coefficient is in [0, 15] (4 bits) stored in a 32-bit word.
+ * Pack 2 coefficients per byte: r[i] = a[2i] | (a[2i+1] << 4)
+ * 256 coefficients -> 128 output bytes.
+ *
+ * UZP1 narrowing chain (32->16->8 bit) extracts the low byte from
+ * each coefficient; UZP1/UZP2 separate even/odd coefficients;
+ * SLI shifts and inserts the odd nibbles.
+ *
+ * 4x unrolled, 2 iterations for 256 coefficients.
+ */
+
+        output          .req x0
+        input           .req x1
+        count           .req x2
+
+.text
+.global MLD_ASM_NAMESPACE(polyw1_pack_32_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyw1_pack_32_asm)
+
+        mov count, #(256 / (32 * 4))
+
+polyw1_pack_32_loop:
+
+        /* Block 0: coefficients 0-31 */
+        ldp q0, q1, [input], #512
+        ldp q2, q3, [input, #(32 - 512)]
+        ldp q4, q5, [input, #(64 - 512)]
+        ldp q6, q7, [input, #(96 - 512)]
+        uzp1 v0.8h, v0.8h, v1.8h
+        uzp1 v2.8h, v2.8h, v3.8h
+        uzp1 v4.8h, v4.8h, v5.8h
+        uzp1 v6.8h, v6.8h, v7.8h
+        uzp1 v0.16b, v0.16b, v2.16b
+        uzp1 v4.16b, v4.16b, v6.16b
+        uzp1 v16.16b, v0.16b, v4.16b
+        uzp2 v0.16b, v0.16b, v4.16b
+        sli v16.16b, v0.16b, #4
+
+        /* Block 1: coefficients 32-63 */
+        ldp q0, q1, [input, #(128 - 512)]
+        ldp q2, q3, [input, #(160 - 512)]
+        ldp q4, q5, [input, #(192 - 512)]
+        ldp q6, q7, [input, #(224 - 512)]
+        uzp1 v0.8h, v0.8h, v1.8h
+        uzp1 v2.8h, v2.8h, v3.8h
+        uzp1 v4.8h, v4.8h, v5.8h
+        uzp1 v6.8h, v6.8h, v7.8h
+        uzp1 v0.16b, v0.16b, v2.16b
+        uzp1 v4.16b, v4.16b, v6.16b
+        uzp1 v17.16b, v0.16b, v4.16b
+        uzp2 v0.16b, v0.16b, v4.16b
+        sli v17.16b, v0.16b, #4
+
+        /* Block 2: coefficients 64-95 */
+        ldp q0, q1, [input, #(256 - 512)]
+        ldp q2, q3, [input, #(288 - 512)]
+        ldp q4, q5, [input, #(320 - 512)]
+        ldp q6, q7, [input, #(352 - 512)]
+        uzp1 v0.8h, v0.8h, v1.8h
+        uzp1 v2.8h, v2.8h, v3.8h
+        uzp1 v4.8h, v4.8h, v5.8h
+        uzp1 v6.8h, v6.8h, v7.8h
+        uzp1 v0.16b, v0.16b, v2.16b
+        uzp1 v4.16b, v4.16b, v6.16b
+        uzp1 v18.16b, v0.16b, v4.16b
+        uzp2 v0.16b, v0.16b, v4.16b
+        sli v18.16b, v0.16b, #4
+
+        /* Block 3: coefficients 96-127 */
+        ldp q0, q1, [input, #(384 - 512)]
+        ldp q2, q3, [input, #(416 - 512)]
+        ldp q4, q5, [input, #(448 - 512)]
+        ldp q6, q7, [input, #(480 - 512)]
+        uzp1 v0.8h, v0.8h, v1.8h
+        uzp1 v2.8h, v2.8h, v3.8h
+        uzp1 v4.8h, v4.8h, v5.8h
+        uzp1 v6.8h, v6.8h, v7.8h
+        uzp1 v0.16b, v0.16b, v2.16b
+        uzp1 v4.16b, v4.16b, v6.16b
+        uzp1 v19.16b, v0.16b, v4.16b
+        uzp2 v0.16b, v0.16b, v4.16b
+        sli v19.16b, v0.16b, #4
+
+        st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [output], #64
+
+        subs count, count, #1
+        bne polyw1_pack_32_loop
+
+        ret
+
+        .unreq output
+        .unreq input
+        .unreq count
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87) */
diff --git a/dev/aarch64_clean/src/polyw1_pack_88_asm.S b/dev/aarch64_clean/src/polyw1_pack_88_asm.S
new file mode 100644
index 000000000..875fc177b
--- /dev/null
+++ b/dev/aarch64_clean/src/polyw1_pack_88_asm.S
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+           (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44)
+/* simpasm: header-end */
+
+/*
+ * polyw1_pack_88: Pack w1 polynomial for GAMMA2 = (Q-1)/88.
+ *
+ * Each coefficient is in [0, 43] (6 bits) stored in a 32-bit word.
+ * Pack 4 coefficients into 3 bytes:
+ *   r[3i+0] =  a[4i+0]       | (a[4i+1] << 6)
+ *   r[3i+1] = (a[4i+1] >> 2) | (a[4i+2] << 4)
+ *   r[3i+2] = (a[4i+2] >> 4) | (a[4i+3] << 2)
+ * 256 coefficients -> 192 output bytes.
+ *
+ * Each group of 4 coefficients in a .4s vector is shifted to its
+ * bit position using USHL, then reduced with ADDP to form one
+ * 24-bit packed value per 32-bit lane.
+ *
+ * Three 2-register TBL instructions then extract the useful 3 bytes
+ * from each 32-bit lane across pairs of adjacent result vectors,
+ * producing 3 contiguous 16-byte output vectors (48 bytes total).
+ *
+ * 4x unrolled, 4 iterations for 256 coefficients.
+ */
+
+        output          .req x0
+        input           .req x1
+        table           .req x2
+        count           .req x3
+
+        v_shifts        .req v24
+        v_tbl0          .req v25
+        v_tbl1          .req v26
+        v_tbl2          .req v27
+
+.text
+.global MLD_ASM_NAMESPACE(polyw1_pack_88_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyw1_pack_88_asm)
+
+        /* Load constants from table pointer (x2):
+         * [0:15]  = v_shifts.4s = {0, 6, 12, 18}
+         * [16:31] = v_tbl0: TBL indices for out0 from {v16, v17}
+         * [32:47] = v_tbl1: TBL indices for out1 from {v17, v18}
+         * [48:63] = v_tbl2: TBL indices for out2 from {v18, v19} */
+        ldp q24, q25, [table]
+        ldp q26, q27, [table, #32]
+
+        mov count, #(256 / (16 * 4))
+
+polyw1_pack_88_loop:
+
+        /* Block 0: coefficients 0-15 */
+        ldp q0, q1, [input], #256
+        ldp q2, q3, [input, #(32 - 256)]
+        ushl v0.4s, v0.4s, v_shifts.4s
+        ushl v1.4s, v1.4s, v_shifts.4s
+        ushl v2.4s, v2.4s, v_shifts.4s
+        ushl v3.4s, v3.4s, v_shifts.4s
+        addp v0.4s, v0.4s, v1.4s
+        addp v2.4s, v2.4s, v3.4s
+        addp v16.4s, v0.4s, v2.4s
+
+        /* Block 1: coefficients 16-31 */
+        ldp q0, q1, [input, #(64 - 256)]
+        ldp q2, q3, [input, #(96 - 256)]
+        ushl v0.4s, v0.4s, v_shifts.4s
+        ushl v1.4s, v1.4s, v_shifts.4s
+        ushl v2.4s, v2.4s, v_shifts.4s
+        ushl v3.4s, v3.4s, v_shifts.4s
+        addp v0.4s, v0.4s, v1.4s
+        addp v2.4s, v2.4s, v3.4s
+        addp v17.4s, v0.4s, v2.4s
+
+        /* Block 2: coefficients 32-47 */
+        ldp q0, q1, [input, #(128 - 256)]
+        ldp q2, q3, [input, #(160 - 256)]
+        ushl v0.4s, v0.4s, v_shifts.4s
+        ushl v1.4s, v1.4s, v_shifts.4s
+        ushl v2.4s, v2.4s, v_shifts.4s
+        ushl v3.4s, v3.4s, v_shifts.4s
+        addp v0.4s, v0.4s, v1.4s
+        addp v2.4s, v2.4s, v3.4s
+        addp v18.4s, v0.4s, v2.4s
+
+        /* Block 3: coefficients 48-63 */
+        ldp q0, q1, [input, #(192 - 256)]
+        ldp q2, q3, [input, #(224 - 256)]
+        ushl v0.4s, v0.4s, v_shifts.4s
+        ushl v1.4s, v1.4s, v_shifts.4s
+        ushl v2.4s, v2.4s, v_shifts.4s
+        ushl v3.4s, v3.4s, v_shifts.4s
+        addp v0.4s, v0.4s, v1.4s
+        addp v2.4s, v2.4s, v3.4s
+        addp v19.4s, v0.4s, v2.4s
+
+        /* Compact + splice into 3 output vectors */
+        tbl v20.16b, {v16.16b, v17.16b}, v_tbl0.16b
+        tbl v21.16b, {v17.16b, v18.16b}, v_tbl1.16b
+        tbl v22.16b, {v18.16b, v19.16b}, v_tbl2.16b
+
+        st1 {v20.16b, v21.16b, v22.16b}, [output], #48
+
+        subs count, count, #1
+        bne polyw1_pack_88_loop
+
+        ret
+
+        .unreq output
+        .unreq input
+        .unreq table
+        .unreq count
+        .unreq v_shifts
+        .unreq v_tbl0
+        .unreq v_tbl1
+        .unreq v_tbl2
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \
+        */
diff --git a/dev/aarch64_opt/meta.h b/dev/aarch64_opt/meta.h
index 2923b8c55..c1c1bd4db 100644
--- a/dev/aarch64_opt/meta.h
+++ b/dev/aarch64_opt/meta.h
@@ -21,6 +21,8 @@
 #define MLD_USE_NATIVE_POLY_CHKNORM
 #define MLD_USE_NATIVE_POLYZ_UNPACK_17
 #define MLD_USE_NATIVE_POLYZ_UNPACK_19
+#define MLD_USE_NATIVE_POLYW1_PACK_32
+#define MLD_USE_NATIVE_POLYW1_PACK_88
 #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
@@ -198,6 +200,44 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf)
 #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
           || MLD_CONFIG_PARAMETER_SET == 87 */
 
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
+{
+  mld_polyw1_pack_32_asm(r, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87 */
+
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
+/* Table of constants for polyw1_pack_88_asm:
+ *   [0:15]  v_shifts: USHL shift amounts {0, 6, 12, 18} as .4s
+ *   [16:31] v_tbl0: TBL indices for out0 from {v16, v17}
+ *   [32:47] v_tbl1: TBL indices for out1 from {v17, v18}
+ *   [48:63] v_tbl2: TBL indices for out2 from {v18, v19} */
+/* clang-format off */
+MLD_ALIGN static const uint8_t mld_polyw1_pack_88_consts[] = {
+  /* v_shifts: {0, 6, 12, 18} as uint32_t little-endian */
+  0, 0, 0, 0,  6, 0, 0, 0,  12, 0, 0, 0,  18, 0, 0, 0,
+  /* v_tbl0: {0,1,2, 4,5,6, 8,9,10, 12,13,14, 16,17,18, 20} */
+  0, 1, 2, 4,  5, 6, 8, 9,  10, 12, 13, 14,  16, 17, 18, 20,
+  /* v_tbl1: {5,6, 8,9,10, 12,13,14, 16,17,18, 20,21,22, 24,25} */
+  5, 6, 8, 9,  10, 12, 13, 14,  16, 17, 18, 20,  21, 22, 24, 25,
+  /* v_tbl2: {10, 12,13,14, 16,17,18, 20,21,22, 24,25,26, 28,29,30} */
+  10, 12, 13, 14,  16, 17, 18, 20,  21, 22, 24, 25,  26, 28, 29, 30,
+};
+/* clang-format on */
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
+{
+  mld_polyw1_pack_88_asm(r, a, mld_polyw1_pack_88_consts);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
+        */
+
 MLD_MUST_CHECK_RETURN_VALUE
 static MLD_INLINE int mld_poly_pointwise_montgomery_native(
     int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],
diff --git a/dev/aarch64_opt/src/arith_native_aarch64.h b/dev/aarch64_opt/src/arith_native_aarch64.h
index f78a1487d..967884699 100644
--- a/dev/aarch64_opt/src/arith_native_aarch64.h
+++ b/dev/aarch64_opt/src/arith_native_aarch64.h
@@ -105,6 +105,12 @@ void mld_polyz_unpack_17_asm(int32_t *r, const uint8_t *buf,
 void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf,
                              const uint8_t *indices);
 
+#define mld_polyw1_pack_32_asm MLD_NAMESPACE(polyw1_pack_32_asm)
+void mld_polyw1_pack_32_asm(uint8_t *r, const int32_t *a);
+
+#define mld_polyw1_pack_88_asm MLD_NAMESPACE(polyw1_pack_88_asm)
+void mld_polyw1_pack_88_asm(uint8_t *r, const int32_t *a, const uint8_t *table);
+
 #define mld_poly_pointwise_montgomery_asm \
   MLD_NAMESPACE(poly_pointwise_montgomery_asm)
 void mld_poly_pointwise_montgomery_asm(int32_t *, const int32_t *,
diff --git a/dev/aarch64_opt/src/polyw1_pack_32_asm.S b/dev/aarch64_opt/src/polyw1_pack_32_asm.S
new file mode 100644
index 000000000..0598b71de
--- /dev/null
+++ b/dev/aarch64_opt/src/polyw1_pack_32_asm.S
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+           (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+            MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+/* simpasm: header-end */
+
+/*
+ * polyw1_pack_32: Pack w1 polynomial for GAMMA2 = (Q-1)/32.
+ *
+ * Each coefficient is in [0, 15] (4 bits) stored in a 32-bit word.
+ * Pack 2 coefficients per byte: r[i] = a[2i] | (a[2i+1] << 4)
+ * 256 coefficients -> 128 output bytes.
+ *
+ * UZP1 narrowing chain (32->16->8 bit) extracts the low byte from
+ * each coefficient; UZP1/UZP2 separate even/odd coefficients;
+ * SLI shifts and inserts the odd nibbles.
+ *
+ * 4x unrolled, 2 iterations for 256 coefficients.
+ */
+
+        output          .req x0
+        input           .req x1
+        count           .req x2
+
+.text
+.global MLD_ASM_NAMESPACE(polyw1_pack_32_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyw1_pack_32_asm)
+
+        mov count, #(256 / (32 * 4))
+
+polyw1_pack_32_loop:
+
+        /* Block 0: coefficients 0-31 */
+        ldp q0, q1, [input], #512
+        ldp q2, q3, [input, #(32 - 512)]
+        ldp q4, q5, [input, #(64 - 512)]
+        ldp q6, q7, [input, #(96 - 512)]
+        uzp1 v0.8h, v0.8h, v1.8h
+        uzp1 v2.8h, v2.8h, v3.8h
+        uzp1 v4.8h, v4.8h, v5.8h
+        uzp1 v6.8h, v6.8h, v7.8h
+        uzp1 v0.16b, v0.16b, v2.16b
+        uzp1 v4.16b, v4.16b, v6.16b
+        uzp1 v16.16b, v0.16b, v4.16b
+        uzp2 v0.16b, v0.16b, v4.16b
+        sli v16.16b, v0.16b, #4
+
+        /* Block 1: coefficients 32-63 */
+        ldp q0, q1, [input, #(128 - 512)]
+        ldp q2, q3, [input, #(160 - 512)]
+        ldp q4, q5, [input, #(192 - 512)]
+        ldp q6, q7, [input, #(224 - 512)]
+        uzp1 v0.8h, v0.8h, v1.8h
+        uzp1 v2.8h, v2.8h, v3.8h
+        uzp1 v4.8h, v4.8h, v5.8h
+        uzp1 v6.8h, v6.8h, v7.8h
+        uzp1 v0.16b, v0.16b, v2.16b
+        uzp1 v4.16b, v4.16b, v6.16b
+        uzp1 v17.16b, v0.16b, v4.16b
+        uzp2 v0.16b, v0.16b, v4.16b
+        sli v17.16b, v0.16b, #4
+
+        /* Block 2: coefficients 64-95 */
+        ldp q0, q1, [input, #(256 - 512)]
+        ldp q2, q3, [input, #(288 - 512)]
+        ldp q4, q5, [input, #(320 - 512)]
+        ldp q6, q7, [input, #(352 - 512)]
+        uzp1 v0.8h, v0.8h, v1.8h
+        uzp1 v2.8h, v2.8h, v3.8h
+        uzp1 v4.8h, v4.8h, v5.8h
+        uzp1 v6.8h, v6.8h, v7.8h
+        uzp1 v0.16b, v0.16b, v2.16b
+        uzp1 v4.16b, v4.16b, v6.16b
+        uzp1 v18.16b, v0.16b, v4.16b
+        uzp2 v0.16b, v0.16b, v4.16b
+        sli v18.16b, v0.16b, #4
+
+        /* Block 3: coefficients 96-127 */
+        ldp q0, q1, [input, #(384 - 512)]
+        ldp q2, q3, [input, #(416 - 512)]
+        ldp q4, q5, [input, #(448 - 512)]
+        ldp q6, q7, [input, #(480 - 512)]
+        uzp1 v0.8h, v0.8h, v1.8h
+        uzp1 v2.8h, v2.8h, v3.8h
+        uzp1 v4.8h, v4.8h, v5.8h
+        uzp1 v6.8h, v6.8h, v7.8h
+        uzp1 v0.16b, v0.16b, v2.16b
+        uzp1 v4.16b, v4.16b, v6.16b
+        uzp1 v19.16b, v0.16b, v4.16b
+        uzp2 v0.16b, v0.16b, v4.16b
+        sli v19.16b, v0.16b, #4
+
+        st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [output], #64
+
+        subs count, count, #1
+        bne polyw1_pack_32_loop
+
+        ret
+
+        .unreq output
+        .unreq input
+        .unreq count
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87) */
diff --git a/dev/aarch64_opt/src/polyw1_pack_88_asm.S b/dev/aarch64_opt/src/polyw1_pack_88_asm.S
new file mode 100644
index 000000000..875fc177b
--- /dev/null
+++ b/dev/aarch64_opt/src/polyw1_pack_88_asm.S
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+           (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44)
+/* simpasm: header-end */
+
+/*
+ * polyw1_pack_88: Pack w1 polynomial for GAMMA2 = (Q-1)/88.
+ *
+ * Each coefficient is in [0, 43] (6 bits) stored in a 32-bit word.
+ * Pack 4 coefficients into 3 bytes:
+ *   r[3i+0] =  a[4i+0]       | (a[4i+1] << 6)
+ *   r[3i+1] = (a[4i+1] >> 2) | (a[4i+2] << 4)
+ *   r[3i+2] = (a[4i+2] >> 4) | (a[4i+3] << 2)
+ * 256 coefficients -> 192 output bytes.
+ *
+ * Each group of 4 coefficients in a .4s vector is shifted to its
+ * bit position using USHL, then reduced with ADDP to form one
+ * 24-bit packed value per 32-bit lane.
+ *
+ * Three 2-register TBL instructions then extract the useful 3 bytes
+ * from each 32-bit lane across pairs of adjacent result vectors,
+ * producing 3 contiguous 16-byte output vectors (48 bytes total).
+ *
+ * 4x unrolled, 4 iterations for 256 coefficients.
+ */
+
+        output          .req x0
+        input           .req x1
+        table           .req x2
+        count           .req x3
+
+        v_shifts        .req v24
+        v_tbl0          .req v25
+        v_tbl1          .req v26
+        v_tbl2          .req v27
+
+.text
+.global MLD_ASM_NAMESPACE(polyw1_pack_88_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyw1_pack_88_asm)
+
+        /* Load constants from table pointer (x2):
+         * [0:15]  = v_shifts.4s = {0, 6, 12, 18}
+         * [16:31] = v_tbl0: TBL indices for out0 from {v16, v17}
+         * [32:47] = v_tbl1: TBL indices for out1 from {v17, v18}
+         * [48:63] = v_tbl2: TBL indices for out2 from {v18, v19} */
+        ldp q24, q25, [table]
+        ldp q26, q27, [table, #32]
+
+        mov count, #(256 / (16 * 4))
+
+polyw1_pack_88_loop:
+
+        /* Block 0: coefficients 0-15 */
+        ldp q0, q1, [input], #256
+        ldp q2, q3, [input, #(32 - 256)]
+        ushl v0.4s, v0.4s, v_shifts.4s
+        ushl v1.4s, v1.4s, v_shifts.4s
+        ushl v2.4s, v2.4s, v_shifts.4s
+        ushl v3.4s, v3.4s, v_shifts.4s
+        addp v0.4s, v0.4s, v1.4s
+        addp v2.4s, v2.4s, v3.4s
+        addp v16.4s, v0.4s, v2.4s
+
+        /* Block 1: coefficients 16-31 */
+        ldp q0, q1, [input, #(64 - 256)]
+        ldp q2, q3, [input, #(96 - 256)]
+        ushl v0.4s, v0.4s, v_shifts.4s
+        ushl v1.4s, v1.4s, v_shifts.4s
+        ushl v2.4s, v2.4s, v_shifts.4s
+        ushl v3.4s, v3.4s, v_shifts.4s
+        addp v0.4s, v0.4s, v1.4s
+        addp v2.4s, v2.4s, v3.4s
+        addp v17.4s, v0.4s, v2.4s
+
+        /* Block 2: coefficients 32-47 */
+        ldp q0, q1, [input, #(128 - 256)]
+        ldp q2, q3, [input, #(160 - 256)]
+        ushl v0.4s, v0.4s, v_shifts.4s
+        ushl v1.4s, v1.4s, v_shifts.4s
+        ushl v2.4s, v2.4s, v_shifts.4s
+        ushl v3.4s, v3.4s, v_shifts.4s
+        addp v0.4s, v0.4s, v1.4s
+        addp v2.4s, v2.4s, v3.4s
+        addp v18.4s, v0.4s, v2.4s
+
+        /* Block 3: coefficients 48-63 */
+        ldp q0, q1, [input, #(192 - 256)]
+        ldp q2, q3, [input, #(224 - 256)]
+        ushl v0.4s, v0.4s, v_shifts.4s
+        ushl v1.4s, v1.4s, v_shifts.4s
+        ushl v2.4s, v2.4s, v_shifts.4s
+        ushl v3.4s, v3.4s, v_shifts.4s
+        addp v0.4s, v0.4s, v1.4s
+        addp v2.4s, v2.4s, v3.4s
+        addp v19.4s, v0.4s, v2.4s
+
+        /* Compact + splice into 3 output vectors */
+        tbl v20.16b, {v16.16b, v17.16b}, v_tbl0.16b
+        tbl v21.16b, {v17.16b, v18.16b}, v_tbl1.16b
+        tbl v22.16b, {v18.16b, v19.16b}, v_tbl2.16b
+
+        st1 {v20.16b, v21.16b, v22.16b}, [output], #48
+
+        subs count, count, #1
+        bne polyw1_pack_88_loop
+
+        ret
+
+        .unreq output
+        .unreq input
+        .unreq table
+        .unreq count
+        .unreq v_shifts
+        .unreq v_tbl0
+        .unreq v_tbl1
+        .unreq v_tbl2
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \
+        */
diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h
index 9e45b661e..cfdbf72f5 100644
--- a/dev/x86_64/meta.h
+++ b/dev/x86_64/meta.h
@@ -25,6 +25,8 @@
 #define MLD_USE_NATIVE_POLY_CHKNORM
 #define MLD_USE_NATIVE_POLYZ_UNPACK_17
 #define MLD_USE_NATIVE_POLYZ_UNPACK_19
+#define MLD_USE_NATIVE_POLYW1_PACK_32
+#define MLD_USE_NATIVE_POLYW1_PACK_88
 #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
@@ -253,6 +255,35 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
 #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
           || MLD_CONFIG_PARAMETER_SET == 87 */
 
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
+{
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
+  mld_polyw1_pack_32_avx2(r, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87 */
+
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
+{
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
+  mld_polyw1_pack_88_avx2(r, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
+        */
+
 MLD_MUST_CHECK_RETURN_VALUE
 static MLD_INLINE int mld_poly_pointwise_montgomery_native(
     int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N])
diff --git a/dev/x86_64/src/arith_native_x86_64.h b/dev/x86_64/src/arith_native_x86_64.h
index 956a6e541..9832c202d 100644
--- a/dev/x86_64/src/arith_native_x86_64.h
+++ b/dev/x86_64/src/arith_native_x86_64.h
@@ -102,6 +102,12 @@ void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a);
 #define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2)
 void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a);
 
+#define mld_polyw1_pack_32_avx2 MLD_NAMESPACE(mld_polyw1_pack_32_avx2)
+void mld_polyw1_pack_32_avx2(uint8_t *r, const int32_t *a);
+
+#define mld_polyw1_pack_88_avx2 MLD_NAMESPACE(mld_polyw1_pack_88_avx2)
+void mld_polyw1_pack_88_avx2(uint8_t *r, const int32_t *a);
+
 #define mld_pointwise_avx2 MLD_NAMESPACE(pointwise_avx2)
 void mld_pointwise_avx2(int32_t *c, const int32_t *a, const int32_t *b,
                         const int32_t *qdata);
diff --git a/dev/x86_64/src/polyw1_pack_32_avx2.c b/dev/x86_64/src/polyw1_pack_32_avx2.c
new file mode 100644
index 000000000..152a7836b
--- /dev/null
+++ b/dev/x86_64/src/polyw1_pack_32_avx2.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ *   CRYSTALS-Dilithium optimized AVX2 implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
+    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+     (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
+
+#include <immintrin.h>
+#include "arith_native_x86_64.h"
+
+/* Pack w1 polynomial (coefficients in [0,15]) for GAMMA2 = (Q-1)/32.
+ * Packs 2 nibbles per byte; 64 coefficients per iteration. */
+void mld_polyw1_pack_32_avx2(uint8_t *r, const int32_t *a)
+{
+  unsigned int i;
+  const __m256i shift = _mm256_set1_epi16((16 << 8) + 1);
+  const __m256i shufbidx =
+      _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, 15,
+                      14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
+
+  for (i = 0; i < MLDSA_N / 64; ++i)
+  {
+    __m256i f0 = _mm256_load_si256((__m256i *)&a[64 * i + 0]);
+    __m256i f1 = _mm256_load_si256((__m256i *)&a[64 * i + 8]);
+    __m256i f2 = _mm256_load_si256((__m256i *)&a[64 * i + 16]);
+    __m256i f3 = _mm256_load_si256((__m256i *)&a[64 * i + 24]);
+    __m256i f4 = _mm256_load_si256((__m256i *)&a[64 * i + 32]);
+    __m256i f5 = _mm256_load_si256((__m256i *)&a[64 * i + 40]);
+    __m256i f6 = _mm256_load_si256((__m256i *)&a[64 * i + 48]);
+    __m256i f7 = _mm256_load_si256((__m256i *)&a[64 * i + 56]);
+    f0 = _mm256_packus_epi32(f0, f1);
+    f1 = _mm256_packus_epi32(f2, f3);
+    f2 = _mm256_packus_epi32(f4, f5);
+    f3 = _mm256_packus_epi32(f6, f7);
+    f0 = _mm256_packus_epi16(f0, f1);
+    f1 = _mm256_packus_epi16(f2, f3);
+    f0 = _mm256_maddubs_epi16(f0, shift);
+    f1 = _mm256_maddubs_epi16(f1, shift);
+    f0 = _mm256_packus_epi16(f0, f1);
+    f0 = _mm256_permute4x64_epi64(f0, 0xD8);
+    f0 = _mm256_shuffle_epi8(f0, shufbidx);
+    _mm256_storeu_si256((__m256i *)&r[32 * i], f0);
+  }
+}
+
+#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+         && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
+         65 || MLD_CONFIG_PARAMETER_SET == 87) */
+
+MLD_EMPTY_CU(avx2_polyw1_pack_32)
+
+#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&                                \
+          !MLD_CONFIG_MULTILEVEL_NO_SHARED &&                                  \
+          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87)) */
diff --git a/dev/x86_64/src/polyw1_pack_88_avx2.c b/dev/x86_64/src/polyw1_pack_88_avx2.c
new file mode 100644
index 000000000..3d1fcb19d
--- /dev/null
+++ b/dev/x86_64/src/polyw1_pack_88_avx2.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ *   CRYSTALS-Dilithium optimized AVX2 implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
+    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+     MLD_CONFIG_PARAMETER_SET == 44)
+
+#include <immintrin.h>
+#include "arith_native_x86_64.h"
+
+/* Pack w1 polynomial (coefficients in [0,43]) for GAMMA2 = (Q-1)/88.
+ * 6-bit encoding, 4 coefficients per 3 bytes; 32 coefficients per iteration. */
+void mld_polyw1_pack_88_avx2(uint8_t *r, const int32_t *a)
+{
+  unsigned int i;
+  const __m256i shift1 = _mm256_set1_epi16((64 << 8) + 1);
+  const __m256i shift2 = _mm256_set1_epi32(((1 << 12) << 16) + 1);
+  const __m256i shufdidx1 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+  const __m256i shufdidx2 = _mm256_set_epi32(-1, -1, 6, 5, 4, 2, 1, 0);
+  const __m256i shufbidx =
+      _mm256_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0,
+                      -1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);
+
+  for (i = 0; i < MLDSA_N / 32; i++)
+  {
+    __m256i f0 = _mm256_load_si256((__m256i *)&a[32 * i + 0]);
+    __m256i f1 = _mm256_load_si256((__m256i *)&a[32 * i + 8]);
+    __m256i f2 = _mm256_load_si256((__m256i *)&a[32 * i + 16]);
+    __m256i f3 = _mm256_load_si256((__m256i *)&a[32 * i + 24]);
+    f0 = _mm256_packus_epi32(f0, f1);
+    f1 = _mm256_packus_epi32(f2, f3);
+    f0 = _mm256_packus_epi16(f0, f1);
+    f0 = _mm256_maddubs_epi16(f0, shift1);
+    f0 = _mm256_madd_epi16(f0, shift2);
+    f0 = _mm256_permutevar8x32_epi32(f0, shufdidx1);
+    f0 = _mm256_shuffle_epi8(f0, shufbidx);
+    f0 = _mm256_permutevar8x32_epi32(f0, shufdidx2);
+
+    /* Each iteration produces 24 valid bytes in the low 192 bits.
+     * Store as 128-bit + 64-bit to avoid writing past the output buffer. */
+    {
+      __m128i lo = _mm256_castsi256_si128(f0);
+      __m128i hi = _mm256_extracti128_si256(f0, 1);
+      _mm_storeu_si128((__m128i *)&r[24 * i], lo);
+      _mm_storel_epi64((__m128i *)&r[24 * i + 16], hi);
+    }
+  }
+}
+
+#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+         && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
+         44) */
+
+MLD_EMPTY_CU(avx2_polyw1_pack_88)
+
+#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&                             \
+          !MLD_CONFIG_MULTILEVEL_NO_SHARED &&                               \
+          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
+          44)) */
diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c
index 4e02f0d7b..985b48aa4 100644
--- a/mldsa/mldsa_native.c
+++ b/mldsa/mldsa_native.c
@@ -88,6 +88,8 @@
 #include "src/native/x86_64/src/poly_decompose_88_avx2.c"
 #include "src/native/x86_64/src/poly_use_hint_32_avx2.c"
 #include "src/native/x86_64/src/poly_use_hint_88_avx2.c"
+#include "src/native/x86_64/src/polyw1_pack_32_avx2.c"
+#include "src/native/x86_64/src/polyw1_pack_88_avx2.c"
 #include "src/native/x86_64/src/polyz_unpack_17_avx2.c"
 #include "src/native/x86_64/src/polyz_unpack_19_avx2.c"
 #include "src/native/x86_64/src/rej_uniform_avx2.c"
@@ -617,6 +619,8 @@
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7
+#undef MLD_USE_NATIVE_POLYW1_PACK_32
+#undef MLD_USE_NATIVE_POLYW1_PACK_88
 #undef MLD_USE_NATIVE_POLYZ_UNPACK_17
 #undef MLD_USE_NATIVE_POLYZ_UNPACK_19
 #undef MLD_USE_NATIVE_POLY_CADDQ
@@ -648,6 +652,8 @@
 #undef mld_polyvecl_pointwise_acc_montgomery_l4_asm
 #undef mld_polyvecl_pointwise_acc_montgomery_l5_asm
 #undef mld_polyvecl_pointwise_acc_montgomery_l7_asm
+#undef mld_polyw1_pack_32_asm
+#undef mld_polyw1_pack_88_asm
 #undef mld_polyz_unpack_17_asm
 #undef mld_polyz_unpack_17_indices
 #undef mld_polyz_unpack_19_asm
@@ -672,6 +678,8 @@
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7
+#undef MLD_USE_NATIVE_POLYW1_PACK_32
+#undef MLD_USE_NATIVE_POLYW1_PACK_88
 #undef MLD_USE_NATIVE_POLYZ_UNPACK_17
 #undef MLD_USE_NATIVE_POLYZ_UNPACK_19
 #undef MLD_USE_NATIVE_POLY_CADDQ
@@ -701,6 +709,8 @@
 #undef mld_poly_decompose_88_avx2
 #undef mld_poly_use_hint_32_avx2
 #undef mld_poly_use_hint_88_avx2
+#undef mld_polyw1_pack_32_avx2
+#undef mld_polyw1_pack_88_avx2
 #undef mld_polyz_unpack_17_avx2
 #undef mld_polyz_unpack_19_avx2
 #undef mld_rej_uniform_avx2
diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S
index cee9460ab..dc83a55b0 100644
--- a/mldsa/mldsa_native_asm.S
+++ b/mldsa/mldsa_native_asm.S
@@ -72,6 +72,8 @@
 #include "src/native/aarch64/src/poly_decompose_88_asm.S"
 #include "src/native/aarch64/src/poly_use_hint_32_asm.S"
 #include "src/native/aarch64/src/poly_use_hint_88_asm.S"
+#include "src/native/aarch64/src/polyw1_pack_32_asm.S"
+#include "src/native/aarch64/src/polyw1_pack_88_asm.S"
 #include "src/native/aarch64/src/polyz_unpack_17_asm.S"
 #include "src/native/aarch64/src/polyz_unpack_19_asm.S"
 #include "src/native/aarch64/src/rej_uniform_asm.S"
@@ -620,6 +622,8 @@
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7
+#undef MLD_USE_NATIVE_POLYW1_PACK_32
+#undef MLD_USE_NATIVE_POLYW1_PACK_88
 #undef MLD_USE_NATIVE_POLYZ_UNPACK_17
 #undef MLD_USE_NATIVE_POLYZ_UNPACK_19
 #undef MLD_USE_NATIVE_POLY_CADDQ
@@ -651,6 +655,8 @@
 #undef mld_polyvecl_pointwise_acc_montgomery_l4_asm
 #undef mld_polyvecl_pointwise_acc_montgomery_l5_asm
 #undef mld_polyvecl_pointwise_acc_montgomery_l7_asm
+#undef mld_polyw1_pack_32_asm
+#undef mld_polyw1_pack_88_asm
 #undef mld_polyz_unpack_17_asm
 #undef mld_polyz_unpack_17_indices
 #undef mld_polyz_unpack_19_asm
@@ -675,6 +681,8 @@
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7
+#undef MLD_USE_NATIVE_POLYW1_PACK_32
+#undef MLD_USE_NATIVE_POLYW1_PACK_88
 #undef MLD_USE_NATIVE_POLYZ_UNPACK_17
 #undef MLD_USE_NATIVE_POLYZ_UNPACK_19
 #undef MLD_USE_NATIVE_POLY_CADDQ
@@ -704,6 +712,8 @@
 #undef mld_poly_decompose_88_avx2
 #undef mld_poly_use_hint_32_avx2
 #undef mld_poly_use_hint_88_avx2
+#undef mld_polyw1_pack_32_avx2
+#undef mld_polyw1_pack_88_avx2
 #undef mld_polyz_unpack_17_avx2
 #undef mld_polyz_unpack_19_avx2
 #undef mld_rej_uniform_avx2
diff --git a/mldsa/src/native/aarch64/meta.h b/mldsa/src/native/aarch64/meta.h
index 2923b8c55..c1c1bd4db 100644
--- a/mldsa/src/native/aarch64/meta.h
+++ b/mldsa/src/native/aarch64/meta.h
@@ -21,6 +21,8 @@
 #define MLD_USE_NATIVE_POLY_CHKNORM
 #define MLD_USE_NATIVE_POLYZ_UNPACK_17
 #define MLD_USE_NATIVE_POLYZ_UNPACK_19
+#define MLD_USE_NATIVE_POLYW1_PACK_32
+#define MLD_USE_NATIVE_POLYW1_PACK_88
 #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
@@ -198,6 +200,44 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf)
 #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
           || MLD_CONFIG_PARAMETER_SET == 87 */
 
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
+{
+  mld_polyw1_pack_32_asm(r, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87 */
+
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
+/* Table of constants for polyw1_pack_88_asm:
+ *   [0:15]  v_shifts: USHL shift amounts {0, 6, 12, 18} as .4s
+ *   [16:31] v_tbl0: TBL indices for out0 from {v16, v17}
+ *   [32:47] v_tbl1: TBL indices for out1 from {v17, v18}
+ *   [48:63] v_tbl2: TBL indices for out2 from {v18, v19} */
+/* clang-format off */
+MLD_ALIGN static const uint8_t mld_polyw1_pack_88_consts[] = {
+  /* v_shifts: {0, 6, 12, 18} as uint32_t little-endian */
+  0, 0, 0, 0,  6, 0, 0, 0,  12, 0, 0, 0,  18, 0, 0, 0,
+  /* v_tbl0: {0,1,2, 4,5,6, 8,9,10, 12,13,14, 16,17,18, 20} */
+  0, 1, 2, 4,  5, 6, 8, 9,  10, 12, 13, 14,  16, 17, 18, 20,
+  /* v_tbl1: {5,6, 8,9,10, 12,13,14, 16,17,18, 20,21,22, 24,25} */
+  5, 6, 8, 9,  10, 12, 13, 14,  16, 17, 18, 20,  21, 22, 24, 25,
+  /* v_tbl2: {10, 12,13,14, 16,17,18, 20,21,22, 24,25,26, 28,29,30} */
+  10, 12, 13, 14,  16, 17, 18, 20,  21, 22, 24, 25,  26, 28, 29, 30,
+};
+/* clang-format on */
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
+{
+  mld_polyw1_pack_88_asm(r, a, mld_polyw1_pack_88_consts);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
+        */
+
 MLD_MUST_CHECK_RETURN_VALUE
 static MLD_INLINE int mld_poly_pointwise_montgomery_native(
     int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],
diff --git a/mldsa/src/native/aarch64/src/arith_native_aarch64.h b/mldsa/src/native/aarch64/src/arith_native_aarch64.h
index f78a1487d..967884699 100644
--- a/mldsa/src/native/aarch64/src/arith_native_aarch64.h
+++ b/mldsa/src/native/aarch64/src/arith_native_aarch64.h
@@ -105,6 +105,12 @@ void mld_polyz_unpack_17_asm(int32_t *r, const uint8_t *buf,
 void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf,
                              const uint8_t *indices);
 
+#define mld_polyw1_pack_32_asm MLD_NAMESPACE(polyw1_pack_32_asm)
+void mld_polyw1_pack_32_asm(uint8_t *r, const int32_t *a);
+
+#define mld_polyw1_pack_88_asm MLD_NAMESPACE(polyw1_pack_88_asm)
+void mld_polyw1_pack_88_asm(uint8_t *r, const int32_t *a, const uint8_t *table);
+
 #define mld_poly_pointwise_montgomery_asm \
   MLD_NAMESPACE(poly_pointwise_montgomery_asm)
 void mld_poly_pointwise_montgomery_asm(int32_t *, const int32_t *,
diff --git a/mldsa/src/native/aarch64/src/polyw1_pack_32_asm.S b/mldsa/src/native/aarch64/src/polyw1_pack_32_asm.S
new file mode 100644
index 000000000..632af257f
--- /dev/null
+++ b/mldsa/src/native/aarch64/src/polyw1_pack_32_asm.S
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+           (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+            MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+
+/*
+ * WARNING: This file is auto-derived from the mldsa-native source file
+ *   dev/aarch64_opt/src/polyw1_pack_32_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLD_ASM_NAMESPACE(polyw1_pack_32_asm)
+MLD_ASM_FN_SYMBOL(polyw1_pack_32_asm)
+
+        .cfi_startproc
+        mov	x2, #0x2                // =2
+
+Lpolyw1_pack_32_loop:
+        ldp	q0, q1, [x1], #0x200
+        ldp	q2, q3, [x1, #-0x1e0]
+        ldp	q4, q5, [x1, #-0x1c0]
+        ldp	q6, q7, [x1, #-0x1a0]
+        uzp1	v0.8h, v0.8h, v1.8h
+        uzp1	v2.8h, v2.8h, v3.8h
+        uzp1	v4.8h, v4.8h, v5.8h
+        uzp1	v6.8h, v6.8h, v7.8h
+        uzp1	v0.16b, v0.16b, v2.16b
+        uzp1	v4.16b, v4.16b, v6.16b
+        uzp1	v16.16b, v0.16b, v4.16b
+        uzp2	v0.16b, v0.16b, v4.16b
+        sli	v16.16b, v0.16b, #0x4
+        ldp	q0, q1, [x1, #-0x180]
+        ldp	q2, q3, [x1, #-0x160]
+        ldp	q4, q5, [x1, #-0x140]
+        ldp	q6, q7, [x1, #-0x120]
+        uzp1	v0.8h, v0.8h, v1.8h
+        uzp1	v2.8h, v2.8h, v3.8h
+        uzp1	v4.8h, v4.8h, v5.8h
+        uzp1	v6.8h, v6.8h, v7.8h
+        uzp1	v0.16b, v0.16b, v2.16b
+        uzp1	v4.16b, v4.16b, v6.16b
+        uzp1	v17.16b, v0.16b, v4.16b
+        uzp2	v0.16b, v0.16b, v4.16b
+        sli	v17.16b, v0.16b, #0x4
+        ldp	q0, q1, [x1, #-0x100]
+        ldp	q2, q3, [x1, #-0xe0]
+        ldp	q4, q5, [x1, #-0xc0]
+        ldp	q6, q7, [x1, #-0xa0]
+        uzp1	v0.8h, v0.8h, v1.8h
+        uzp1	v2.8h, v2.8h, v3.8h
+        uzp1	v4.8h, v4.8h, v5.8h
+        uzp1	v6.8h, v6.8h, v7.8h
+        uzp1	v0.16b, v0.16b, v2.16b
+        uzp1	v4.16b, v4.16b, v6.16b
+        uzp1	v18.16b, v0.16b, v4.16b
+        uzp2	v0.16b, v0.16b, v4.16b
+        sli	v18.16b, v0.16b, #0x4
+        ldp	q0, q1, [x1, #-0x80]
+        ldp	q2, q3, [x1, #-0x60]
+        ldp	q4, q5, [x1, #-0x40]
+        ldp	q6, q7, [x1, #-0x20]
+        uzp1	v0.8h, v0.8h, v1.8h
+        uzp1	v2.8h, v2.8h, v3.8h
+        uzp1	v4.8h, v4.8h, v5.8h
+        uzp1	v6.8h, v6.8h, v7.8h
+        uzp1	v0.16b, v0.16b, v2.16b
+        uzp1	v4.16b, v4.16b, v6.16b
+        uzp1	v19.16b, v0.16b, v4.16b
+        uzp2	v0.16b, v0.16b, v4.16b
+        sli	v19.16b, v0.16b, #0x4
+        st1	{ v16.16b, v17.16b, v18.16b, v19.16b }, [x0], #64
+        subs	x2, x2, #0x1
+        b.ne	Lpolyw1_pack_32_loop
+        ret
+        .cfi_endproc
+
+MLD_ASM_FN_SIZE(polyw1_pack_32_asm)
+
+#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87) */
diff --git a/mldsa/src/native/aarch64/src/polyw1_pack_88_asm.S b/mldsa/src/native/aarch64/src/polyw1_pack_88_asm.S
new file mode 100644
index 000000000..8e3b2b0dc
--- /dev/null
+++ b/mldsa/src/native/aarch64/src/polyw1_pack_88_asm.S
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+           (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44)
+
+/*
+ * WARNING: This file is auto-derived from the mldsa-native source file
+ *   dev/aarch64_opt/src/polyw1_pack_88_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLD_ASM_NAMESPACE(polyw1_pack_88_asm)
+MLD_ASM_FN_SYMBOL(polyw1_pack_88_asm)
+
+        .cfi_startproc
+        ldp	q24, q25, [x2]
+        ldp	q26, q27, [x2, #0x20]
+        mov	x3, #0x4                // =4
+
+Lpolyw1_pack_88_loop:
+        ldp	q0, q1, [x1], #0x100
+        ldp	q2, q3, [x1, #-0xe0]
+        ushl	v0.4s, v0.4s, v24.4s
+        ushl	v1.4s, v1.4s, v24.4s
+        ushl	v2.4s, v2.4s, v24.4s
+        ushl	v3.4s, v3.4s, v24.4s
+        addp	v0.4s, v0.4s, v1.4s
+        addp	v2.4s, v2.4s, v3.4s
+        addp	v16.4s, v0.4s, v2.4s
+        ldp	q0, q1, [x1, #-0xc0]
+        ldp	q2, q3, [x1, #-0xa0]
+        ushl	v0.4s, v0.4s, v24.4s
+        ushl	v1.4s, v1.4s, v24.4s
+        ushl	v2.4s, v2.4s, v24.4s
+        ushl	v3.4s, v3.4s, v24.4s
+        addp	v0.4s, v0.4s, v1.4s
+        addp	v2.4s, v2.4s, v3.4s
+        addp	v17.4s, v0.4s, v2.4s
+        ldp	q0, q1, [x1, #-0x80]
+        ldp	q2, q3, [x1, #-0x60]
+        ushl	v0.4s, v0.4s, v24.4s
+        ushl	v1.4s, v1.4s, v24.4s
+        ushl	v2.4s, v2.4s, v24.4s
+        ushl	v3.4s, v3.4s, v24.4s
+        addp	v0.4s, v0.4s, v1.4s
+        addp	v2.4s, v2.4s, v3.4s
+        addp	v18.4s, v0.4s, v2.4s
+        ldp	q0, q1, [x1, #-0x40]
+        ldp	q2, q3, [x1, #-0x20]
+        ushl	v0.4s, v0.4s, v24.4s
+        ushl	v1.4s, v1.4s, v24.4s
+        ushl	v2.4s, v2.4s, v24.4s
+        ushl	v3.4s, v3.4s, v24.4s
+        addp	v0.4s, v0.4s, v1.4s
+        addp	v2.4s, v2.4s, v3.4s
+        addp	v19.4s, v0.4s, v2.4s
+        tbl	v20.16b, { v16.16b, v17.16b }, v25.16b
+        tbl	v21.16b, { v17.16b, v18.16b }, v26.16b
+        tbl	v22.16b, { v18.16b, v19.16b }, v27.16b
+        st1	{ v20.16b, v21.16b, v22.16b }, [x0], #48
+        subs	x3, x3, #0x1
+        b.ne	Lpolyw1_pack_88_loop
+        ret
+        .cfi_endproc
+
+MLD_ASM_FN_SIZE(polyw1_pack_88_asm)
+
+#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \
+        */
diff --git a/mldsa/src/native/api.h b/mldsa/src/native/api.h
index 409337fcc..925ac7f4e 100644
--- a/mldsa/src/native/api.h
+++ b/mldsa/src/native/api.h
@@ -498,6 +498,57 @@ __contract__(
           || MLD_CONFIG_PARAMETER_SET == 87 */
 #endif /* MLD_USE_NATIVE_POLYZ_UNPACK_19 */
 
+#if defined(MLD_USE_NATIVE_POLYW1_PACK_32)
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+/*************************************************
+ * Name:        mld_polyw1_pack_32_native
+ *
+ * Description: Native implementation of polyw1_pack for GAMMA2 = (Q-1)/32.
+ *              Bit-pack polynomial w1 with coefficients in [0, 15],
+ *              packing 2 nibbles per byte.
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *              - const int32_t *a: pointer to input polynomial coefficients
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
+__contract__(
+  requires(memory_no_alias(r, MLDSA_POLYW1_PACKEDBYTES))
+  requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N))
+  requires(array_bound(a, 0, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)))
+  assigns(memory_slice(r, MLDSA_POLYW1_PACKEDBYTES))
+  ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS)
+);
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87 */
+#endif /* MLD_USE_NATIVE_POLYW1_PACK_32 */
+
+#if defined(MLD_USE_NATIVE_POLYW1_PACK_88)
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
+/*************************************************
+ * Name:        mld_polyw1_pack_88_native
+ *
+ * Description: Native implementation of polyw1_pack for GAMMA2 = (Q-1)/88.
+ *              Bit-pack polynomial w1 with coefficients in [0, 43],
+ *              using 6-bit encoding (4 coefficients -> 3 bytes).
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *              - const int32_t *a: pointer to input polynomial coefficients
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
+__contract__(
+  requires(memory_no_alias(r, MLDSA_POLYW1_PACKEDBYTES))
+  requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N))
+  requires(array_bound(a, 0, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)))
+  assigns(memory_slice(r, MLDSA_POLYW1_PACKEDBYTES))
+  ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS)
+);
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
+        */
+#endif /* MLD_USE_NATIVE_POLYW1_PACK_88 */
+
 #if defined(MLD_USE_NATIVE_POINTWISE_MONTGOMERY)
 /*************************************************
  * Name:        mld_poly_pointwise_montgomery_native
diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h
index 9e45b661e..cfdbf72f5 100644
--- a/mldsa/src/native/x86_64/meta.h
+++ b/mldsa/src/native/x86_64/meta.h
@@ -25,6 +25,8 @@
 #define MLD_USE_NATIVE_POLY_CHKNORM
 #define MLD_USE_NATIVE_POLYZ_UNPACK_17
 #define MLD_USE_NATIVE_POLYZ_UNPACK_19
+#define MLD_USE_NATIVE_POLYW1_PACK_32
+#define MLD_USE_NATIVE_POLYW1_PACK_88
 #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
@@ -253,6 +255,35 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
 #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
           || MLD_CONFIG_PARAMETER_SET == 87 */
 
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
+{
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
+  mld_polyw1_pack_32_avx2(r, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87 */
+
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
+{
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
+  mld_polyw1_pack_88_avx2(r, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
+        */
+
 MLD_MUST_CHECK_RETURN_VALUE
 static MLD_INLINE int mld_poly_pointwise_montgomery_native(
     int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N])
diff --git a/mldsa/src/native/x86_64/src/arith_native_x86_64.h b/mldsa/src/native/x86_64/src/arith_native_x86_64.h
index 956a6e541..9832c202d 100644
--- a/mldsa/src/native/x86_64/src/arith_native_x86_64.h
+++ b/mldsa/src/native/x86_64/src/arith_native_x86_64.h
@@ -102,6 +102,12 @@ void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a);
 #define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2)
 void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a);
 
+#define mld_polyw1_pack_32_avx2 MLD_NAMESPACE(mld_polyw1_pack_32_avx2)
+void mld_polyw1_pack_32_avx2(uint8_t *r, const int32_t *a);
+
+#define mld_polyw1_pack_88_avx2 MLD_NAMESPACE(mld_polyw1_pack_88_avx2)
+void mld_polyw1_pack_88_avx2(uint8_t *r, const int32_t *a);
+
 #define mld_pointwise_avx2 MLD_NAMESPACE(pointwise_avx2)
 void mld_pointwise_avx2(int32_t *c, const int32_t *a, const int32_t *b,
                         const int32_t *qdata);
diff --git a/mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c b/mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c
new file mode 100644
index 000000000..152a7836b
--- /dev/null
+++ b/mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ *   CRYSTALS-Dilithium optimized AVX2 implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
+    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+     (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
+
+#include <immintrin.h>
+#include "arith_native_x86_64.h"
+
+/* Pack w1 polynomial (coefficients in [0,15]) for GAMMA2 = (Q-1)/32.
+ * Packs 2 nibbles per byte; 64 coefficients per iteration. */
+void mld_polyw1_pack_32_avx2(uint8_t *r, const int32_t *a)
+{
+  unsigned int i;
+  const __m256i shift = _mm256_set1_epi16((16 << 8) + 1);
+  const __m256i shufbidx =
+      _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, 15,
+                      14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
+
+  for (i = 0; i < MLDSA_N / 64; ++i)
+  {
+    __m256i f0 = _mm256_load_si256((__m256i *)&a[64 * i + 0]);
+    __m256i f1 = _mm256_load_si256((__m256i *)&a[64 * i + 8]);
+    __m256i f2 = _mm256_load_si256((__m256i *)&a[64 * i + 16]);
+    __m256i f3 = _mm256_load_si256((__m256i *)&a[64 * i + 24]);
+    __m256i f4 = _mm256_load_si256((__m256i *)&a[64 * i + 32]);
+    __m256i f5 = _mm256_load_si256((__m256i *)&a[64 * i + 40]);
+    __m256i f6 = _mm256_load_si256((__m256i *)&a[64 * i + 48]);
+    __m256i f7 = _mm256_load_si256((__m256i *)&a[64 * i + 56]);
+    f0 = _mm256_packus_epi32(f0, f1);
+    f1 = _mm256_packus_epi32(f2, f3);
+    f2 = _mm256_packus_epi32(f4, f5);
+    f3 = _mm256_packus_epi32(f6, f7);
+    f0 = _mm256_packus_epi16(f0, f1);
+    f1 = _mm256_packus_epi16(f2, f3);
+    f0 = _mm256_maddubs_epi16(f0, shift);
+    f1 = _mm256_maddubs_epi16(f1, shift);
+    f0 = _mm256_packus_epi16(f0, f1);
+    f0 = _mm256_permute4x64_epi64(f0, 0xD8);
+    f0 = _mm256_shuffle_epi8(f0, shufbidx);
+    _mm256_storeu_si256((__m256i *)&r[32 * i], f0);
+  }
+}
+
+#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+         && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
+         65 || MLD_CONFIG_PARAMETER_SET == 87) */
+
+MLD_EMPTY_CU(avx2_polyw1_pack_32)
+
+#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&                                \
+          !MLD_CONFIG_MULTILEVEL_NO_SHARED &&                                  \
+          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87)) */
diff --git a/mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c b/mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c
new file mode 100644
index 000000000..3d1fcb19d
--- /dev/null
+++ b/mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ *   CRYSTALS-Dilithium optimized AVX2 implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
+    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+     MLD_CONFIG_PARAMETER_SET == 44)
+
+#include <immintrin.h>
+#include "arith_native_x86_64.h"
+
+/* Pack w1 polynomial (coefficients in [0,43]) for GAMMA2 = (Q-1)/88.
+ * 6-bit encoding, 4 coefficients per 3 bytes; 32 coefficients per iteration. */
+void mld_polyw1_pack_88_avx2(uint8_t *r, const int32_t *a)
+{
+  unsigned int i;
+  const __m256i shift1 = _mm256_set1_epi16((64 << 8) + 1);
+  const __m256i shift2 = _mm256_set1_epi32(((1 << 12) << 16) + 1);
+  const __m256i shufdidx1 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+  const __m256i shufdidx2 = _mm256_set_epi32(-1, -1, 6, 5, 4, 2, 1, 0);
+  const __m256i shufbidx =
+      _mm256_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0,
+                      -1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);
+
+  for (i = 0; i < MLDSA_N / 32; i++)
+  {
+    __m256i f0 = _mm256_load_si256((__m256i *)&a[32 * i + 0]);
+    __m256i f1 = _mm256_load_si256((__m256i *)&a[32 * i + 8]);
+    __m256i f2 = _mm256_load_si256((__m256i *)&a[32 * i + 16]);
+    __m256i f3 = _mm256_load_si256((__m256i *)&a[32 * i + 24]);
+    f0 = _mm256_packus_epi32(f0, f1);
+    f1 = _mm256_packus_epi32(f2, f3);
+    f0 = _mm256_packus_epi16(f0, f1);
+    f0 = _mm256_maddubs_epi16(f0, shift1);
+    f0 = _mm256_madd_epi16(f0, shift2);
+    f0 = _mm256_permutevar8x32_epi32(f0, shufdidx1);
+    f0 = _mm256_shuffle_epi8(f0, shufbidx);
+    f0 = _mm256_permutevar8x32_epi32(f0, shufdidx2);
+
+    /* Each iteration produces 24 valid bytes in the low 192 bits.
+     * Store as 128-bit + 64-bit to avoid writing past the output buffer. */
+    {
+      __m128i lo = _mm256_castsi256_si128(f0);
+      __m128i hi = _mm256_extracti128_si256(f0, 1);
+      _mm_storeu_si128((__m128i *)&r[24 * i], lo);
+      _mm_storel_epi64((__m128i *)&r[24 * i + 16], hi);
+    }
+  }
+}
+
+#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+         && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
+         44) */
+
+MLD_EMPTY_CU(avx2_polyw1_pack_88)
+
+#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&                             \
+          !MLD_CONFIG_MULTILEVEL_NO_SHARED &&                               \
+          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
+          44)) */
diff --git a/mldsa/src/poly_kl.c b/mldsa/src/poly_kl.c
index da7c826a4..1dfeb52a3 100644
--- a/mldsa/src/poly_kl.c
+++ b/mldsa/src/poly_kl.c
@@ -888,31 +888,53 @@ void mld_polyz_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYZ_PACKEDBYTES])
 MLD_INTERNAL_API
 void mld_polyw1_pack(uint8_t r[MLDSA_POLYW1_PACKEDBYTES], const mld_poly *a)
 {
-  unsigned int i;
-
+#if defined(MLD_USE_NATIVE_POLYW1_PACK_88) && MLD_CONFIG_PARAMETER_SET == 44
+  int ret;
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
+  ret = mld_polyw1_pack_88_native(r, a->coeffs);
+  if (ret == MLD_NATIVE_FUNC_SUCCESS)
+  {
+    return;
+  }
+#elif defined(MLD_USE_NATIVE_POLYW1_PACK_32) && \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+  int ret;
   mld_assert_bound(a->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
+  ret = mld_polyw1_pack_32_native(r, a->coeffs);
+  if (ret == MLD_NATIVE_FUNC_SUCCESS)
+  {
+    return;
+  }
+#endif /* !(MLD_USE_NATIVE_POLYW1_PACK_88 && MLD_CONFIG_PARAMETER_SET == 44)  \
+          && MLD_USE_NATIVE_POLYW1_PACK_32 && (MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87) */
+  {
+    unsigned int i;
+
+    mld_assert_bound(a->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
 
 #if MLD_CONFIG_PARAMETER_SET == 44
-  for (i = 0; i < MLDSA_N / 4; ++i)
-  __loop__(
+    for (i = 0; i < MLDSA_N / 4; ++i)
+    __loop__(
     invariant(i <= MLDSA_N/4))
-  {
-    r[3 * i + 0] = (uint8_t)((a->coeffs[4 * i + 0]) & 0xFF);
-    r[3 * i + 0] |= (uint8_t)((a->coeffs[4 * i + 1] << 6) & 0xFF);
-    r[3 * i + 1] = (uint8_t)((a->coeffs[4 * i + 1] >> 2) & 0xFF);
-    r[3 * i + 1] |= (uint8_t)((a->coeffs[4 * i + 2] << 4) & 0xFF);
-    r[3 * i + 2] = (uint8_t)((a->coeffs[4 * i + 2] >> 4) & 0xFF);
-    r[3 * i + 2] |= (uint8_t)((a->coeffs[4 * i + 3] << 2) & 0xFF);
-  }
+    {
+      r[3 * i + 0] = (uint8_t)((a->coeffs[4 * i + 0]) & 0xFF);
+      r[3 * i + 0] |= (uint8_t)((a->coeffs[4 * i + 1] << 6) & 0xFF);
+      r[3 * i + 1] = (uint8_t)((a->coeffs[4 * i + 1] >> 2) & 0xFF);
+      r[3 * i + 1] |= (uint8_t)((a->coeffs[4 * i + 2] << 4) & 0xFF);
+      r[3 * i + 2] = (uint8_t)((a->coeffs[4 * i + 2] >> 4) & 0xFF);
+      r[3 * i + 2] |= (uint8_t)((a->coeffs[4 * i + 3] << 2) & 0xFF);
+    }
 #else  /* MLD_CONFIG_PARAMETER_SET == 44 */
-  for (i = 0; i < MLDSA_N / 2; ++i)
-  __loop__(
+    for (i = 0; i < MLDSA_N / 2; ++i)
+    __loop__(
     invariant(i <= MLDSA_N/2))
-  {
-    r[i] =
-        (uint8_t)((a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4)) & 0xFF);
-  }
+    {
+      r[i] = (uint8_t)((a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4)) &
+                       0xFF);
+    }
 #endif /* MLD_CONFIG_PARAMETER_SET != 44 */
+  }
 }
 
 /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. */
diff --git a/test/bench/bench_components_mldsa.c b/test/bench/bench_components_mldsa.c
index b6261f156..308555cff 100644
--- a/test/bench/bench_components_mldsa.c
+++ b/test/bench/bench_components_mldsa.c
@@ -9,6 +9,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "../../mldsa/src/poly.h"
+#include "../../mldsa/src/poly_kl.h"
 #include "../../mldsa/src/polyvec.h"
 #include "../../mldsa/src/randombytes.h"
 #include "hal.h"
@@ -80,6 +81,18 @@ static int bench(void)
         mld_polyvec_matrix_pointwise_montgomery(&polyveck_out, &polymat,
                                                 &polyvecl_b))
 
+  /* polyw1_pack: set up valid input with coefficients in [0, (Q-1)/(2*GAMMA2))
+   */
+  {
+    MLD_ALIGN uint8_t w1_packed[MLDSA_POLYW1_PACKEDBYTES];
+    MLD_ALIGN mld_poly w1_poly;
+    for (i = 0; i < MLDSA_N; i++)
+    {
+      w1_poly.coeffs[i] = (int32_t)(i % ((MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)));
+    }
+    BENCH("polyw1_pack", mld_polyw1_pack(w1_packed, &w1_poly))
+  }
+
   return 0;
 }