diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index 29869dd91982d..6ce1432011b47 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -182,6 +182,7 @@ #![feature(hexagon_target_feature)] #![feature(loongarch_target_feature)] #![feature(mips_target_feature)] +#![feature(movrs_target_feature)] #![feature(nvptx_target_feature)] #![feature(powerpc_target_feature)] #![feature(riscv_target_feature)] diff --git a/library/stdarch/.github/workflows/main.yml b/library/stdarch/.github/workflows/main.yml index 0ec355aa3ca4f..3749ed1f6ac81 100644 --- a/library/stdarch/.github/workflows/main.yml +++ b/library/stdarch/.github/workflows/main.yml @@ -8,7 +8,7 @@ jobs: name: Check Style runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Install Rust run: rustup update nightly --no-self-update && rustup default nightly - run: ci/style.sh @@ -18,7 +18,7 @@ jobs: needs: [style] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Install Rust run: rustup update nightly --no-self-update && rustup default nightly - run: ci/dox.sh @@ -30,7 +30,7 @@ jobs: needs: [style] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Install Rust run: rustup update nightly --no-self-update && rustup default nightly - run: cargo test --manifest-path crates/stdarch-verify/Cargo.toml @@ -216,7 +216,7 @@ jobs: build_std: true steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Install Rust run: | rustup update nightly --no-self-update @@ -285,7 +285,7 @@ jobs: build_std: true steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Install Rust run: | rustup update nightly --no-self-update @@ -310,7 +310,7 @@ jobs: name: Check stdarch-gen-{arm, loongarch, hexagon} output runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Install Rust run: rustup update nightly && rustup default nightly && rustup component add rustfmt - name: Check arm spec diff --git a/library/stdarch/crates/core_arch/src/aarch64/mod.rs b/library/stdarch/crates/core_arch/src/aarch64/mod.rs index b48bdac57e7db..d7295659c3c9a 100644 --- a/library/stdarch/crates/core_arch/src/aarch64/mod.rs +++ b/library/stdarch/crates/core_arch/src/aarch64/mod.rs @@ -17,6 +17,10 @@ mod mte; #[unstable(feature = "stdarch_aarch64_mte", issue = "129010")] pub use self::mte::*; +mod rand; +#[unstable(feature = "stdarch_aarch64_rand", issue = "153514")] +pub use self::rand::*; + mod neon; #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub use self::neon::*; diff --git a/library/stdarch/crates/core_arch/src/aarch64/mte.rs b/library/stdarch/crates/core_arch/src/aarch64/mte.rs index c400f774bcce0..a5031a45c1a0e 100644 --- a/library/stdarch/crates/core_arch/src/aarch64/mte.rs +++ b/library/stdarch/crates/core_arch/src/aarch64/mte.rs @@ -3,35 +3,17 @@ //! [ACLE documentation](https://arm-software.github.io/acle/main/acle.html#markdown-toc-mte-intrinsics) unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.irg" - )] + #[link_name = "llvm.aarch64.irg"] fn irg_(ptr: *const (), exclude: i64) -> *const (); - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.gmi" - )] + #[link_name = "llvm.aarch64.gmi"] fn gmi_(ptr: *const (), exclude: i64) -> i64; - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.ldg" - )] + #[link_name = "llvm.aarch64.ldg"] fn ldg_(ptr: *const (), tag_ptr: *const ()) -> *const (); - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.stg" - )] + #[link_name = "llvm.aarch64.stg"] fn stg_(tagged_ptr: *const (), addr_to_tag: *const ()); - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.addg" - )] + #[link_name = "llvm.aarch64.addg"] fn addg_(ptr: *const (), value: i64) -> *const (); - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.subp" - )] + #[link_name = "llvm.aarch64.subp"] fn subp_(ptr_a: *const (), ptr_b: *const ()) -> i64; } @@ -127,42 +109,46 @@ mod test { use super::*; use stdarch_test::assert_instr; - #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(irg))] // FIXME: MSVC `dumpbin` doesn't support MTE + // Instruction tests are separate because the functions use generics. + // + // FIXME: As of 2026 MSVC `dumpbin` doesn't support MTE. + + #[cfg_attr(not(target_env = "msvc"), assert_instr(irg))] #[allow(dead_code)] #[target_feature(enable = "mte")] unsafe fn test_arm_mte_create_random_tag(src: *const (), mask: u64) -> *const () { __arm_mte_create_random_tag(src, mask) } - #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(addg))] + #[cfg_attr(not(target_env = "msvc"), assert_instr(addg))] #[allow(dead_code)] #[target_feature(enable = "mte")] unsafe fn test_arm_mte_increment_tag(src: *const ()) -> *const () { __arm_mte_increment_tag::<1, _>(src) } - #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(gmi))] + #[cfg_attr(not(target_env = "msvc"), assert_instr(gmi))] #[allow(dead_code)] #[target_feature(enable = "mte")] unsafe fn test_arm_mte_exclude_tag(src: *const (), excluded: u64) -> u64 { __arm_mte_exclude_tag(src, excluded) } - #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(stg))] + #[cfg_attr(not(target_env = "msvc"), assert_instr(stg))] #[allow(dead_code)] #[target_feature(enable = "mte")] unsafe fn test_arm_mte_set_tag(src: *const ()) { __arm_mte_set_tag(src) } - #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(ldg))] + #[cfg_attr(not(target_env = "msvc"), assert_instr(ldg))] #[allow(dead_code)] #[target_feature(enable = "mte")] unsafe fn test_arm_mte_get_tag(src: *const ()) -> *const () { __arm_mte_get_tag(src) } - #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(subp))] + #[cfg_attr(not(target_env = "msvc"), assert_instr(subp))] #[allow(dead_code)] #[target_feature(enable = "mte")] unsafe fn test_arm_mte_ptrdiff(a: *const (), b: *const ()) -> i64 { diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs index 490f04020ee43..74af50016690b 100644 --- a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs +++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs @@ -13532,7 +13532,14 @@ pub fn vmaxh_f16(a: f16, b: f16) -> f16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(fmaxnm))] pub fn vmaxnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { - unsafe { simd_fmax(a, b) } + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnm.v1f64" + )] + fn _vmaxnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t; + } + unsafe { _vmaxnm_f64(a, b) } } #[doc = "Floating-point Maximum Number (vector)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmq_f64)"] @@ -13541,7 +13548,14 @@ pub fn vmaxnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(fmaxnm))] pub fn vmaxnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { - unsafe { simd_fmax(a, b) } + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnm.v2f64" + )] + fn _vmaxnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vmaxnmq_f64(a, b) } } #[doc = "Floating-point Maximum Number"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmh_f16)"] @@ -13551,7 +13565,14 @@ pub fn vmaxnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { #[cfg(not(target_arch = "arm64ec"))] #[cfg_attr(test, assert_instr(fmaxnm))] pub fn vmaxnmh_f16(a: f16, b: f16) -> f16 { - f16::max(a, b) + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnm.f16" + )] + fn _vmaxnmh_f16(a: f16, b: f16) -> f16; + } + unsafe { _vmaxnmh_f16(a, b) } } #[doc = "Floating-point maximum number across vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmv_f16)"] @@ -13561,7 +13582,14 @@ pub fn vmaxnmh_f16(a: f16, b: f16) -> f16 { #[cfg(not(target_arch = "arm64ec"))] #[cfg_attr(test, assert_instr(fmaxnmv))] pub fn vmaxnmv_f16(a: float16x4_t) -> f16 { - unsafe { simd_reduce_max(a) } + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmv.f16.v4f16" + )] + fn _vmaxnmv_f16(a: float16x4_t) -> f16; + } + unsafe { _vmaxnmv_f16(a) } } #[doc = "Floating-point maximum number across vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmvq_f16)"] @@ -13571,7 +13599,14 @@ pub fn vmaxnmv_f16(a: float16x4_t) -> f16 { #[cfg(not(target_arch = "arm64ec"))] #[cfg_attr(test, assert_instr(fmaxnmv))] pub fn vmaxnmvq_f16(a: float16x8_t) -> f16 { - unsafe { simd_reduce_max(a) } + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmv.f16.v8f16" + )] + fn _vmaxnmvq_f16(a: float16x8_t) -> f16; + } + unsafe { _vmaxnmvq_f16(a) } } #[doc = "Floating-point maximum number across vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmv_f32)"] @@ -13580,7 +13615,14 @@ pub fn vmaxnmvq_f16(a: float16x8_t) -> f16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(fmaxnmp))] pub fn vmaxnmv_f32(a: float32x2_t) -> f32 { - unsafe { simd_reduce_max(a) } + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmv.f32.v2f32" + )] + fn _vmaxnmv_f32(a: float32x2_t) -> f32; + } + unsafe { _vmaxnmv_f32(a) } } #[doc = "Floating-point maximum number across vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmvq_f64)"] @@ -13589,7 +13631,14 @@ pub fn vmaxnmv_f32(a: float32x2_t) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(fmaxnmp))] pub fn vmaxnmvq_f64(a: float64x2_t) -> f64 { - unsafe { simd_reduce_max(a) } + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmv.f64.v2f64" + )] + fn _vmaxnmvq_f64(a: float64x2_t) -> f64; + } + unsafe { _vmaxnmvq_f64(a) } } #[doc = "Floating-point maximum number across vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmvq_f32)"] @@ -13598,7 +13647,14 @@ pub fn vmaxnmvq_f64(a: float64x2_t) -> f64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(fmaxnmv))] pub fn vmaxnmvq_f32(a: float32x4_t) -> f32 { - unsafe { simd_reduce_max(a) } + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnmv.f32.v4f32" + )] + fn _vmaxnmvq_f32(a: float32x4_t) -> f32; + } + unsafe { _vmaxnmvq_f32(a) } } #[doc = "Floating-point maximum number across vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_f16)"] @@ -13846,7 +13902,14 @@ pub fn vminh_f16(a: f16, b: f16) -> f16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(fminnm))] pub fn vminnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { - unsafe { simd_fmin(a, b) } + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnm.v1f64" + )] + fn _vminnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t; + } + unsafe { _vminnm_f64(a, b) } } #[doc = "Floating-point Minimum Number (vector)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmq_f64)"] @@ -13855,7 +13918,14 @@ pub fn vminnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(fminnm))] pub fn vminnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { - unsafe { simd_fmin(a, b) } + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnm.v2f64" + )] + fn _vminnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; + } + unsafe { _vminnmq_f64(a, b) } } #[doc = "Floating-point Minimum Number"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmh_f16)"] @@ -13865,7 +13935,14 @@ pub fn vminnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { #[cfg(not(target_arch = "arm64ec"))] #[cfg_attr(test, assert_instr(fminnm))] pub fn vminnmh_f16(a: f16, b: f16) -> f16 { - f16::min(a, b) + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnm.f16" + )] + fn _vminnmh_f16(a: f16, b: f16) -> f16; + } + unsafe { _vminnmh_f16(a, b) } } #[doc = "Floating-point minimum number across vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmv_f16)"] @@ -13875,7 +13952,14 @@ pub fn vminnmh_f16(a: f16, b: f16) -> f16 { #[cfg(not(target_arch = "arm64ec"))] #[cfg_attr(test, assert_instr(fminnmv))] pub fn vminnmv_f16(a: float16x4_t) -> f16 { - unsafe { simd_reduce_min(a) } + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmv.f16.v4f16" + )] + fn _vminnmv_f16(a: float16x4_t) -> f16; + } + unsafe { _vminnmv_f16(a) } } #[doc = "Floating-point minimum number across vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmvq_f16)"] @@ -13885,7 +13969,14 @@ pub fn vminnmv_f16(a: float16x4_t) -> f16 { #[cfg(not(target_arch = "arm64ec"))] #[cfg_attr(test, assert_instr(fminnmv))] pub fn vminnmvq_f16(a: float16x8_t) -> f16 { - unsafe { simd_reduce_min(a) } + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmv.f16.v8f16" + )] + fn _vminnmvq_f16(a: float16x8_t) -> f16; + } + unsafe { _vminnmvq_f16(a) } } #[doc = "Floating-point minimum number across vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmv_f32)"] @@ -13894,7 +13985,14 @@ pub fn vminnmvq_f16(a: float16x8_t) -> f16 { #[cfg_attr(test, assert_instr(fminnmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vminnmv_f32(a: float32x2_t) -> f32 { - unsafe { simd_reduce_min(a) } + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmv.f32.v2f32" + )] + fn _vminnmv_f32(a: float32x2_t) -> f32; + } + unsafe { _vminnmv_f32(a) } } #[doc = "Floating-point minimum number across vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmvq_f64)"] @@ -13903,7 +14001,14 @@ pub fn vminnmv_f32(a: float32x2_t) -> f32 { #[cfg_attr(test, assert_instr(fminnmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vminnmvq_f64(a: float64x2_t) -> f64 { - unsafe { simd_reduce_min(a) } + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmv.f64.v2f64" + )] + fn _vminnmvq_f64(a: float64x2_t) -> f64; + } + unsafe { _vminnmvq_f64(a) } } #[doc = "Floating-point minimum number across vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmvq_f32)"] @@ -13912,7 +14017,14 @@ pub fn vminnmvq_f64(a: float64x2_t) -> f64 { #[cfg_attr(test, assert_instr(fminnmv))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub fn vminnmvq_f32(a: float32x4_t) -> f32 { - unsafe { simd_reduce_min(a) } + unsafe extern "unadjusted" { + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnmv.f32.v4f32" + )] + fn _vminnmvq_f32(a: float32x4_t) -> f32; + } + unsafe { _vminnmvq_f32(a) } } #[doc = "Floating-point minimum number across vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_f16)"] diff --git a/library/stdarch/crates/core_arch/src/aarch64/rand.rs b/library/stdarch/crates/core_arch/src/aarch64/rand.rs new file mode 100644 index 0000000000000..3f52cf2ce8657 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/aarch64/rand.rs @@ -0,0 +1,48 @@ +//! AArch64 Random Number intrinsics +//! +//! [ACLE documentation](https://arm-software.github.io/acle/main/acle.html#random-number-generation-intrinsics) + +#[cfg(test)] +use stdarch_test::assert_instr; + +unsafe extern "unadjusted" { + #[link_name = "llvm.aarch64.rndr"] + fn rndr_() -> Tuple; + + #[link_name = "llvm.aarch64.rndrrs"] + fn rndrrs_() -> Tuple; +} + +#[repr(C)] +struct Tuple { + bits: u64, + status: bool, +} + +/// Stores a 64-bit random number into the object pointed to by the argument and returns +/// zero. If the implementation could not generate a random number within a reasonable +/// period of time the object pointed to by the input is set to zero and a non-zero value +/// is returned. +#[inline] +#[target_feature(enable = "rand")] +#[cfg_attr(test, assert_instr(mrs))] +#[unstable(feature = "stdarch_aarch64_rand", issue = "153514")] +pub unsafe fn __rndr(value: *mut u64) -> i32 { + let Tuple { bits, status } = rndr_(); + unsafe { *value = bits }; + status as i32 +} + +/// Reseeds the random number generator. After that stores a 64-bit random number into +/// the object pointed to by the argument and returns zero. If the implementation could +/// not generate a random number within a reasonable period of time the object pointed +/// to by the input is set to zero and a non-zero value is returned. +#[inline] +#[target_feature(enable = "rand")] +#[cfg_attr(test, assert_instr(mrs))] +#[unstable(feature = "stdarch_aarch64_rand", issue = "153514")] +pub unsafe fn __rndrrs(value: *mut u64) -> i32 { + let Tuple { bits, status } = rndrrs_(); + unsafe { *value = bits }; + status as i32 +} diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs index a7c33917a88cc..4a846e2877462 100644 --- a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs @@ -25891,7 +25891,15 @@ pub fn vmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { )] #[cfg(not(target_arch = "arm64ec"))] pub fn vmaxnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { - unsafe { simd_fmax(a, b) } + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v4f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnm.v4f16" + )] + fn _vmaxnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vmaxnm_f16(a, b) } } #[doc = "Floating-point Maximum Number (vector)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmq_f16)"] @@ -25913,7 +25921,15 @@ pub fn vmaxnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { )] #[cfg(not(target_arch = "arm64ec"))] pub fn vmaxnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { - unsafe { simd_fmax(a, b) } + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v8f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnm.v8f16" + )] + fn _vmaxnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vmaxnmq_f16(a, b) } } #[doc = "Floating-point Maximum Number (vector)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnm_f32)"] @@ -25934,7 +25950,15 @@ pub fn vmaxnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { - unsafe { simd_fmax(a, b) } + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnm.v2f32" + )] + fn _vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vmaxnm_f32(a, b) } } #[doc = "Floating-point Maximum Number (vector)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmq_f32)"] @@ -25955,7 +25979,15 @@ pub fn vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { - unsafe { simd_fmax(a, b) } + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fmaxnm.v4f32" + )] + fn _vmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vmaxnmq_f32(a, b) } } #[doc = "Minimum (vector)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmin_f16)"] @@ -26383,7 +26415,15 @@ pub fn vminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { )] #[cfg(not(target_arch = "arm64ec"))] pub fn vminnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { - unsafe { simd_fmin(a, b) } + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v4f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnm.v4f16" + )] + fn _vminnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t; + } + unsafe { _vminnm_f16(a, b) } } #[doc = "Floating-point Minimum Number (vector)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmq_f16)"] @@ -26405,7 +26445,15 @@ pub fn vminnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t { )] #[cfg(not(target_arch = "arm64ec"))] pub fn vminnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { - unsafe { simd_fmin(a, b) } + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v8f16")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnm.v8f16" + )] + fn _vminnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t; + } + unsafe { _vminnmq_f16(a, b) } } #[doc = "Floating-point Minimum Number (vector)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnm_f32)"] @@ -26426,7 +26474,15 @@ pub fn vminnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { - unsafe { simd_fmin(a, b) } + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v2f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnm.v2f32" + )] + fn _vminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } + unsafe { _vminnm_f32(a, b) } } #[doc = "Floating-point Minimum Number (vector)"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmq_f32)"] @@ -26447,7 +26503,15 @@ pub fn vminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub fn vminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { - unsafe { simd_fmin(a, b) } + unsafe extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v4f32")] + #[cfg_attr( + any(target_arch = "aarch64", target_arch = "arm64ec"), + link_name = "llvm.aarch64.neon.fminnm.v4f32" + )] + fn _vminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } + unsafe { _vminnmq_f32(a, b) } } #[doc = "Floating-point multiply-add to accumulator"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_f32)"] diff --git a/library/stdarch/crates/core_arch/src/lib.rs b/library/stdarch/crates/core_arch/src/lib.rs index 8a1bead7c4791..9255994e5ee81 100644 --- a/library/stdarch/crates/core_arch/src/lib.rs +++ b/library/stdarch/crates/core_arch/src/lib.rs @@ -39,7 +39,8 @@ const_trait_impl, const_cmp, const_eval_select, - maybe_uninit_as_bytes + maybe_uninit_as_bytes, + movrs_target_feature )] #![cfg_attr(test, feature(test, abi_vectorcall, stdarch_internal))] #![deny(clippy::missing_inline_in_public_items)] diff --git a/library/stdarch/crates/core_arch/src/s390x/vector.rs b/library/stdarch/crates/core_arch/src/s390x/vector.rs index 346cd674df665..376c912c04090 100644 --- a/library/stdarch/crates/core_arch/src/s390x/vector.rs +++ b/library/stdarch/crates/core_arch/src/s390x/vector.rs @@ -335,6 +335,20 @@ unsafe extern "unadjusted" { #[link_name = "llvm.s390.vcfn"] fn vcfn(a: vector_signed_short, immarg: i32) -> vector_signed_short; #[link_name = "llvm.s390.vcnf"] fn vcnf(a: vector_signed_short, immarg: i32) -> vector_signed_short; #[link_name = "llvm.s390.vcrnfs"] fn vcrnfs(a: vector_float, b: vector_float, immarg: i32) -> vector_signed_short; + + // These are the intrinsics we'd like to use (with mode 0). However, they require + // "vector-enhancements-1" and don't have a fallback, whereas `vec_min`/`vec_max` should be + // available with just "vector". Therefore, we cannot use them. + // #[link_name = "llvm.s390.vfmaxsb"] fn vfmaxsb(a: vector_float, b: vector_float, mode: i32) -> vector_float; + // #[link_name = "llvm.s390.vfmaxdb"] fn vfmaxdb(a: vector_double, b: vector_double, mode: i32) -> vector_double; + // #[link_name = "llvm.s390.vfminsb"] fn vfminsb(a: vector_float, b: vector_float, mode: i32) -> vector_float; + // #[link_name = "llvm.s390.vfmindb"] fn vfmindb(a: vector_double, b: vector_double, mode: i32) -> vector_double; + // Instead, we use "portable" LLVM intrinsics -- even though those have the wrong semantics + // (https://github.com/rust-lang/stdarch/issues/2060), they usually do the right thing. + #[link_name = "llvm.minnum.v4f32"] fn minnum_v4f32(a: vector_float, b: vector_float) -> vector_float; + #[link_name = "llvm.minnum.v2f64"] fn minnum_v2f64(a: vector_double, b: vector_double) -> vector_double; + #[link_name = "llvm.maxnum.v4f32"] fn maxnum_v4f32(a: vector_float, b: vector_float) -> vector_float; + #[link_name = "llvm.maxnum.v2f64"] fn maxnum_v2f64(a: vector_double, b: vector_double) -> vector_double; } #[repr(simd)] @@ -780,8 +794,8 @@ mod sealed { impl_max!(vec_vmxslg, vector_unsigned_long_long, vmxlg); } - test_impl! { vec_vfmaxsb (a: vector_float, b: vector_float) -> vector_float [simd_fmax, "vector-enhancements-1" vfmaxsb ] } - test_impl! { vec_vfmaxdb (a: vector_double, b: vector_double) -> vector_double [simd_fmax, "vector-enhancements-1" vfmaxdb] } + test_impl! { vec_vfmaxsb (a: vector_float, b: vector_float) -> vector_float [maxnum_v4f32, "vector-enhancements-1" vfmaxsb] } + test_impl! { vec_vfmaxdb (a: vector_double, b: vector_double) -> vector_double [maxnum_v2f64, "vector-enhancements-1" vfmaxdb] } impl_vec_trait!([VectorMax vec_max] vec_vfmaxsb (vector_float, vector_float) -> vector_float); impl_vec_trait!([VectorMax vec_max] vec_vfmaxdb (vector_double, vector_double) -> vector_double); @@ -827,8 +841,8 @@ mod sealed { impl_min!(vec_vmnslg, vector_unsigned_long_long, vmnlg); } - test_impl! { vec_vfminsb (a: vector_float, b: vector_float) -> vector_float [simd_fmin, "vector-enhancements-1" vfminsb] } - test_impl! { vec_vfmindb (a: vector_double, b: vector_double) -> vector_double [simd_fmin, "vector-enhancements-1" vfmindb] } + test_impl! { vec_vfminsb (a: vector_float, b: vector_float) -> vector_float [minnum_v4f32, "vector-enhancements-1" vfminsb] } + test_impl! { vec_vfmindb (a: vector_double, b: vector_double) -> vector_double [minnum_v2f64, "vector-enhancements-1" vfmindb] } impl_vec_trait!([VectorMin vec_min] vec_vfminsb (vector_float, vector_float) -> vector_float); impl_vec_trait!([VectorMin vec_min] vec_vfmindb (vector_double, vector_double) -> vector_double); @@ -7477,6 +7491,30 @@ mod tests { [0, !0, !0, !0] } + test_vec_2! { test_vec_max_f32, vec_max, f32x4, f32x4 -> f32x4, + [1.0, f32::NAN, f32::INFINITY, 2.0], + [-10.0, -10.0, 5.0, f32::NAN], + [1.0, -10.0, f32::INFINITY, 2.0] + } + + test_vec_2! { test_vec_min_f32, vec_min, f32x4, f32x4 -> f32x4, + [1.0, f32::NAN, f32::INFINITY, 2.0], + [-10.0, -10.0, 5.0, f32::NAN], + [-10.0, -10.0, 5.0, 2.0] + } + + test_vec_2! { test_vec_max_f64, vec_max, f64x2, f64x2 -> f64x2, + [f64::NAN, 2.0], + [-10.0, f64::NAN], + [-10.0, 2.0] + } + + test_vec_2! { test_vec_min_f64, vec_min, f64x2, f64x2 -> f64x2, + [f64::NAN, 2.0], + [-10.0, f64::NAN], + [-10.0, 2.0] + } + #[simd_test(enable = "vector")] fn test_vec_meadd() { let a = vector_unsigned_short([1, 0, 2, 0, 3, 0, 4, 0]); diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs index b41f8576cfe54..659d6c3be88e7 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs @@ -11753,7 +11753,7 @@ pub const fn _mm_maskz_cvtepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovswb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const fn _mm512_cvtsepi16_epi8(a: __m512i) -> __m256i { unsafe { simd_cast::<_, i8x32>(simd_imax( @@ -11771,7 +11771,7 @@ pub const fn _mm512_cvtsepi16_epi8(a: __m512i) -> __m256i { #[target_feature(enable = "avx512bw")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovswb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const fn _mm512_mask_cvtsepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { unsafe { simd_select_bitmask(k, _mm512_cvtsepi16_epi8(a).as_i8x32(), src.as_i8x32()).as_m256i() @@ -11785,7 +11785,7 @@ pub const fn _mm512_mask_cvtsepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) #[target_feature(enable = "avx512bw")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovswb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const fn _mm512_maskz_cvtsepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { unsafe { simd_select_bitmask(k, _mm512_cvtsepi16_epi8(a).as_i8x32(), i8x32::ZERO).as_m256i() } } @@ -11797,7 +11797,7 @@ pub const fn _mm512_maskz_cvtsepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovswb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const fn _mm256_cvtsepi16_epi8(a: __m256i) -> __m128i { unsafe { simd_cast::<_, i8x16>(simd_imax( @@ -11815,7 +11815,7 @@ pub const fn _mm256_cvtsepi16_epi8(a: __m256i) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovswb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const fn _mm256_mask_cvtsepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { unsafe { simd_select_bitmask(k, _mm256_cvtsepi16_epi8(a).as_i8x16(), src.as_i8x16()).as_m128i() @@ -11829,7 +11829,7 @@ pub const fn _mm256_mask_cvtsepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovswb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const fn _mm256_maskz_cvtsepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { unsafe { simd_select_bitmask(k, _mm256_cvtsepi16_epi8(a).as_i8x16(), i8x16::ZERO).as_m128i() } } @@ -11874,7 +11874,7 @@ pub fn _mm_maskz_cvtsepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovuswb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const fn _mm512_cvtusepi16_epi8(a: __m512i) -> __m256i { unsafe { simd_cast::<_, u8x32>(simd_imin(a.as_u16x32(), u16x32::splat(u8::MAX as _))).as_m256i() @@ -11888,7 +11888,7 @@ pub const fn _mm512_cvtusepi16_epi8(a: __m512i) -> __m256i { #[target_feature(enable = "avx512bw")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovuswb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const fn _mm512_mask_cvtusepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { unsafe { simd_select_bitmask(k, _mm512_cvtusepi16_epi8(a).as_u8x32(), src.as_u8x32()).as_m256i() @@ -11902,7 +11902,7 @@ pub const fn _mm512_mask_cvtusepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) #[target_feature(enable = "avx512bw")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovuswb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const fn _mm512_maskz_cvtusepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { unsafe { simd_select_bitmask(k, _mm512_cvtusepi16_epi8(a).as_u8x32(), u8x32::ZERO).as_m256i() } } @@ -11914,7 +11914,7 @@ pub const fn _mm512_maskz_cvtusepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovuswb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const fn _mm256_cvtusepi16_epi8(a: __m256i) -> __m128i { unsafe { simd_cast::<_, u8x16>(simd_imin(a.as_u16x16(), u16x16::splat(u8::MAX as _))).as_m128i() @@ -11928,7 +11928,7 @@ pub const fn _mm256_cvtusepi16_epi8(a: __m256i) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovuswb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const fn _mm256_mask_cvtusepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { unsafe { simd_select_bitmask(k, _mm256_cvtusepi16_epi8(a).as_u8x16(), src.as_u8x16()).as_m128i() @@ -11942,7 +11942,7 @@ pub const fn _mm256_mask_cvtusepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovuswb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const fn _mm256_maskz_cvtusepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { unsafe { simd_select_bitmask(k, _mm256_cvtusepi16_epi8(a).as_u8x16(), u8x16::ZERO).as_m128i() } } @@ -12678,7 +12678,7 @@ pub unsafe fn _mm_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: #[target_feature(enable = "avx512bw")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovwb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const unsafe fn _mm512_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) { let result = _mm512_cvtepi16_epi8(a).as_i8x32(); let mask = simd_select_bitmask(k, i8x32::splat(!0), i8x32::ZERO); @@ -12692,7 +12692,7 @@ pub const unsafe fn _mm512_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mma #[target_feature(enable = "avx512bw,avx512vl")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovwb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const unsafe fn _mm256_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) { let result = _mm256_cvtepi16_epi8(a).as_i8x16(); let mask = simd_select_bitmask(k, i8x16::splat(!0), i8x16::ZERO); @@ -12706,7 +12706,7 @@ pub const unsafe fn _mm256_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mma #[target_feature(enable = "avx512bw,avx512vl")] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmovwb))] -#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")] +#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")] pub const unsafe fn _mm_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { let result: i8x8 = simd_shuffle!( _mm_cvtepi16_epi8(a).as_i8x16(), diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs b/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs index 49b790b151049..8cd8764f24868 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs @@ -12,7 +12,7 @@ use stdarch_test::assert_instr; #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpwssd))] pub fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } + unsafe { transmute(vpdpwssd(src.as_i32x16(), a.as_i16x32(), b.as_i16x32())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -51,7 +51,7 @@ pub fn _mm512_maskz_dpwssd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m5 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpwssd))] pub fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i16x16(), b.as_i16x16())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. @@ -62,7 +62,7 @@ pub fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpwssd))] pub fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i16x16(), b.as_i16x16())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -101,7 +101,7 @@ pub fn _mm256_maskz_dpwssd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m25 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpwssd))] pub fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i16x8(), b.as_i16x8())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. @@ -112,7 +112,7 @@ pub fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpwssd))] pub fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i16x8(), b.as_i16x8())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -151,7 +151,7 @@ pub fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpwssds))] pub fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } + unsafe { transmute(vpdpwssds(src.as_i32x16(), a.as_i16x32(), b.as_i16x32())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -190,7 +190,7 @@ pub fn _mm512_maskz_dpwssds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpwssds))] pub fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i16x16(), b.as_i16x16())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. @@ -201,7 +201,7 @@ pub fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpwssds))] pub fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i16x16(), b.as_i16x16())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -240,7 +240,7 @@ pub fn _mm256_maskz_dpwssds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m2 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpwssds))] pub fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i16x8(), b.as_i16x8())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. @@ -251,7 +251,7 @@ pub fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpwssds))] pub fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i16x8(), b.as_i16x8())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -290,7 +290,7 @@ pub fn _mm_maskz_dpwssds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpbusd))] pub fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } + unsafe { transmute(vpdpbusd(src.as_i32x16(), a.as_u8x64(), b.as_i8x64())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -329,7 +329,7 @@ pub fn _mm512_maskz_dpbusd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m5 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpbusd))] pub fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_u8x32(), b.as_i8x32())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. @@ -340,7 +340,7 @@ pub fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpbusd))] pub fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_u8x32(), b.as_i8x32())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -379,7 +379,7 @@ pub fn _mm256_maskz_dpbusd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m25 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpbusd))] pub fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_u8x16(), b.as_i8x16())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. @@ -390,7 +390,7 @@ pub fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpbusd))] pub fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_u8x16(), b.as_i8x16())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -429,7 +429,7 @@ pub fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpbusds))] pub fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } + unsafe { transmute(vpdpbusds(src.as_i32x16(), a.as_u8x64(), b.as_i8x64())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -468,7 +468,7 @@ pub fn _mm512_maskz_dpbusds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpbusds))] pub fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_u8x32(), b.as_i8x32())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. @@ -479,7 +479,7 @@ pub fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpbusds))] pub fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_u8x32(), b.as_i8x32())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -518,7 +518,7 @@ pub fn _mm256_maskz_dpbusds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m2 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpbusds))] pub fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_u8x16(), b.as_i8x16())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. @@ -529,7 +529,7 @@ pub fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpdpbusds))] pub fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_u8x16(), b.as_i8x16())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -570,7 +570,7 @@ pub fn _mm_maskz_dpbusds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i #[cfg_attr(test, assert_instr(vpdpbssd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpbssd_128(src.as_i32x4(), a.as_i8x16(), b.as_i8x16())) } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit @@ -583,7 +583,7 @@ pub fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpdpbssd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpbssd_256(src.as_i32x8(), a.as_i8x32(), b.as_i8x32())) } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit @@ -596,7 +596,7 @@ pub fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpdpbssds))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpbssds_128(src.as_i32x4(), a.as_i8x16(), b.as_i8x16())) } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit @@ -609,7 +609,7 @@ pub fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpdpbssds))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpbssds_256(src.as_i32x8(), a.as_i8x32(), b.as_i8x32())) } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit @@ -622,7 +622,7 @@ pub fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpdpbsud))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpbsud_128(src.as_i32x4(), a.as_i8x16(), b.as_u8x16())) } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit @@ -635,7 +635,7 @@ pub fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpdpbsud))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpbsud_256(src.as_i32x8(), a.as_i8x32(), b.as_u8x32())) } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit @@ -648,7 +648,7 @@ pub fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpdpbsuds))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i8x16(), b.as_u8x16())) } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit @@ -661,7 +661,7 @@ pub fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpdpbsuds))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i8x32(), b.as_u8x32())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit @@ -674,7 +674,7 @@ pub fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpdpbuud))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpbuud_128(src.as_i32x4(), a.as_u8x16(), b.as_u8x16())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit @@ -687,7 +687,7 @@ pub fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpdpbuud))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpbuud_256(src.as_i32x8(), a.as_u8x32(), b.as_u8x32())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit @@ -700,7 +700,7 @@ pub fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpdpbuuds))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpbuuds_128(src.as_i32x4(), a.as_u8x16(), b.as_u8x16())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit @@ -713,7 +713,7 @@ pub fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpdpbuuds))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpbuuds_256(src.as_i32x8(), a.as_u8x32(), b.as_u8x32())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit @@ -726,7 +726,7 @@ pub fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpdpwsud))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpwsud_128(src.as_i32x4(), a.as_i16x8(), b.as_u16x8())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit @@ -739,7 +739,7 @@ pub fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpdpwsud))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpwsud_256(src.as_i32x8(), a.as_i16x16(), b.as_u16x16())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit @@ -752,7 +752,7 @@ pub fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpdpwsuds))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i16x8(), b.as_u16x8())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit @@ -765,7 +765,7 @@ pub fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpdpwsuds))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i16x16(), b.as_u16x16())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit @@ -778,7 +778,7 @@ pub fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpdpwusd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpwusd_128(src.as_i32x4(), a.as_u16x8(), b.as_i16x8())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit @@ -791,7 +791,7 @@ pub fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpdpwusd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpwusd_256(src.as_i32x8(), a.as_u16x16(), b.as_i16x16())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit @@ -804,7 +804,7 @@ pub fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpdpwusds))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpwusds_128(src.as_i32x4(), a.as_u16x8(), b.as_i16x8())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit @@ -817,7 +817,7 @@ pub fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpdpwusds))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpwusds_256(src.as_i32x8(), a.as_u16x16(), b.as_i16x16())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit @@ -830,7 +830,7 @@ pub fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpdpwuud))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpwuud_128(src.as_i32x4(), a.as_u16x8(), b.as_u16x8())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit @@ -843,7 +843,7 @@ pub fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpdpwuud))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpwuud_256(src.as_i32x8(), a.as_u16x16(), b.as_u16x16())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit @@ -856,7 +856,7 @@ pub fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpdpwuuds))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } + unsafe { transmute(vpdpwuuds_128(src.as_i32x4(), a.as_u16x8(), b.as_u16x8())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit @@ -869,98 +869,98 @@ pub fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpdpwuuds))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } + unsafe { transmute(vpdpwuuds_256(src.as_i32x8(), a.as_u16x16(), b.as_u16x16())) } } #[allow(improper_ctypes)] unsafe extern "C" { #[link_name = "llvm.x86.avx512.vpdpwssd.512"] - fn vpdpwssd(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + fn vpdpwssd(src: i32x16, a: i16x32, b: i16x32) -> i32x16; #[link_name = "llvm.x86.avx512.vpdpwssd.256"] - fn vpdpwssd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpwssd256(src: i32x8, a: i16x16, b: i16x16) -> i32x8; #[link_name = "llvm.x86.avx512.vpdpwssd.128"] - fn vpdpwssd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpwssd128(src: i32x4, a: i16x8, b: i16x8) -> i32x4; #[link_name = "llvm.x86.avx512.vpdpwssds.512"] - fn vpdpwssds(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + fn vpdpwssds(src: i32x16, a: i16x32, b: i16x32) -> i32x16; #[link_name = "llvm.x86.avx512.vpdpwssds.256"] - fn vpdpwssds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpwssds256(src: i32x8, a: i16x16, b: i16x16) -> i32x8; #[link_name = "llvm.x86.avx512.vpdpwssds.128"] - fn vpdpwssds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpwssds128(src: i32x4, a: i16x8, b: i16x8) -> i32x4; #[link_name = "llvm.x86.avx512.vpdpbusd.512"] - fn vpdpbusd(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + fn vpdpbusd(src: i32x16, a: u8x64, b: i8x64) -> i32x16; #[link_name = "llvm.x86.avx512.vpdpbusd.256"] - fn vpdpbusd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpbusd256(src: i32x8, a: u8x32, b: i8x32) -> i32x8; #[link_name = "llvm.x86.avx512.vpdpbusd.128"] - fn vpdpbusd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpbusd128(src: i32x4, a: u8x16, b: i8x16) -> i32x4; #[link_name = "llvm.x86.avx512.vpdpbusds.512"] - fn vpdpbusds(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + fn vpdpbusds(src: i32x16, a: u8x64, b: i8x64) -> i32x16; #[link_name = "llvm.x86.avx512.vpdpbusds.256"] - fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpbusds256(src: i32x8, a: u8x32, b: i8x32) -> i32x8; #[link_name = "llvm.x86.avx512.vpdpbusds.128"] - fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpbusds128(src: i32x4, a: u8x16, b: i8x16) -> i32x4; #[link_name = "llvm.x86.avx2.vpdpbssd.128"] - fn vpdpbssd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpbssd_128(src: i32x4, a: i8x16, b: i8x16) -> i32x4; #[link_name = "llvm.x86.avx2.vpdpbssd.256"] - fn vpdpbssd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpbssd_256(src: i32x8, a: i8x32, b: i8x32) -> i32x8; #[link_name = "llvm.x86.avx2.vpdpbssds.128"] - fn vpdpbssds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpbssds_128(src: i32x4, a: i8x16, b: i8x16) -> i32x4; #[link_name = "llvm.x86.avx2.vpdpbssds.256"] - fn vpdpbssds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpbssds_256(src: i32x8, a: i8x32, b: i8x32) -> i32x8; #[link_name = "llvm.x86.avx2.vpdpbsud.128"] - fn vpdpbsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpbsud_128(src: i32x4, a: i8x16, b: u8x16) -> i32x4; #[link_name = "llvm.x86.avx2.vpdpbsud.256"] - fn vpdpbsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpbsud_256(src: i32x8, a: i8x32, b: u8x32) -> i32x8; #[link_name = "llvm.x86.avx2.vpdpbsuds.128"] - fn vpdpbsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpbsuds_128(src: i32x4, a: i8x16, b: u8x16) -> i32x4; #[link_name = "llvm.x86.avx2.vpdpbsuds.256"] - fn vpdpbsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpbsuds_256(src: i32x8, a: i8x32, b: u8x32) -> i32x8; #[link_name = "llvm.x86.avx2.vpdpbuud.128"] - fn vpdpbuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpbuud_128(src: i32x4, a: u8x16, b: u8x16) -> i32x4; #[link_name = "llvm.x86.avx2.vpdpbuud.256"] - fn vpdpbuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpbuud_256(src: i32x8, a: u8x32, b: u8x32) -> i32x8; #[link_name = "llvm.x86.avx2.vpdpbuuds.128"] - fn vpdpbuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpbuuds_128(src: i32x4, a: u8x16, b: u8x16) -> i32x4; #[link_name = "llvm.x86.avx2.vpdpbuuds.256"] - fn vpdpbuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpbuuds_256(src: i32x8, a: u8x32, b: u8x32) -> i32x8; #[link_name = "llvm.x86.avx2.vpdpwsud.128"] - fn vpdpwsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpwsud_128(src: i32x4, a: i16x8, b: u16x8) -> i32x4; #[link_name = "llvm.x86.avx2.vpdpwsud.256"] - fn vpdpwsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpwsud_256(src: i32x8, a: i16x16, b: u16x16) -> i32x8; #[link_name = "llvm.x86.avx2.vpdpwsuds.128"] - fn vpdpwsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpwsuds_128(src: i32x4, a: i16x8, b: u16x8) -> i32x4; #[link_name = "llvm.x86.avx2.vpdpwsuds.256"] - fn vpdpwsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpwsuds_256(src: i32x8, a: i16x16, b: u16x16) -> i32x8; #[link_name = "llvm.x86.avx2.vpdpwusd.128"] - fn vpdpwusd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpwusd_128(src: i32x4, a: u16x8, b: i16x8) -> i32x4; #[link_name = "llvm.x86.avx2.vpdpwusd.256"] - fn vpdpwusd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpwusd_256(src: i32x8, a: u16x16, b: i16x16) -> i32x8; #[link_name = "llvm.x86.avx2.vpdpwusds.128"] - fn vpdpwusds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpwusds_128(src: i32x4, a: u16x8, b: i16x8) -> i32x4; #[link_name = "llvm.x86.avx2.vpdpwusds.256"] - fn vpdpwusds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpwusds_256(src: i32x8, a: u16x16, b: i16x16) -> i32x8; #[link_name = "llvm.x86.avx2.vpdpwuud.128"] - fn vpdpwuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpwuud_128(src: i32x4, a: u16x8, b: u16x8) -> i32x4; #[link_name = "llvm.x86.avx2.vpdpwuud.256"] - fn vpdpwuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpwuud_256(src: i32x8, a: u16x16, b: u16x16) -> i32x8; #[link_name = "llvm.x86.avx2.vpdpwuuds.128"] - fn vpdpwuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + fn vpdpwuuds_128(src: i32x4, a: u16x8, b: u16x8) -> i32x4; #[link_name = "llvm.x86.avx2.vpdpwuuds.256"] - fn vpdpwuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + fn vpdpwuuds_256(src: i32x8, a: u16x16, b: u16x16) -> i32x8; } #[cfg(test)] diff --git a/library/stdarch/crates/core_arch/src/x86/mod.rs b/library/stdarch/crates/core_arch/src/x86/mod.rs index 9396507f08045..68a963f65b7d4 100644 --- a/library/stdarch/crates/core_arch/src/x86/mod.rs +++ b/library/stdarch/crates/core_arch/src/x86/mod.rs @@ -774,3 +774,7 @@ pub use self::avx512fp16::*; mod kl; #[stable(feature = "keylocker_x86", since = "1.89.0")] pub use self::kl::*; + +mod movrs; +#[unstable(feature = "movrs_target_feature", issue = "137976")] +pub use self::movrs::*; diff --git a/library/stdarch/crates/core_arch/src/x86/movrs.rs b/library/stdarch/crates/core_arch/src/x86/movrs.rs new file mode 100644 index 0000000000000..d5f4d146c44aa --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/movrs.rs @@ -0,0 +1,23 @@ +//! Read-shared move intrinsics + +#[cfg(test)] +use stdarch_test::assert_instr; + +unsafe extern "unadjusted" { + #[link_name = "llvm.x86.prefetchrs"] + fn prefetchrs(p: *const u8); +} + +/// Prefetches the cache line that contains address `p`, with an indication that the source memory +/// location is likely to become read-shared by multiple processors, i.e., read in the future by at +/// least one other processor before it is written, assuming it is ever written in the future. +/// +/// Note: this intrinsic is safe to use even though it takes a raw pointer argument. In general, this +/// cannot change the behavior of the program, including not trapping on invalid pointers. +#[inline] +#[target_feature(enable = "movrs")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(prefetchrst2))] +#[unstable(feature = "movrs_target_feature", issue = "137976")] +pub fn _m_prefetchrs(p: *const u8) { + unsafe { prefetchrs(p) } +} diff --git a/library/stdarch/crates/core_arch/src/x86_64/amx.rs b/library/stdarch/crates/core_arch/src/x86_64/amx.rs index 4e20e014cf20a..03bbe3e449258 100644 --- a/library/stdarch/crates/core_arch/src/x86_64/amx.rs +++ b/library/stdarch/crates/core_arch/src/x86_64/amx.rs @@ -398,6 +398,22 @@ pub unsafe fn _tile_cvtrowd2ps(row: u32) -> __m512 { tcvtrowd2ps(TILE as i8, row).as_m512() } +/// Moves a row from a tile register to a zmm register, converting the packed 32-bit signed integer +/// elements to packed single-precision (32-bit) floating-point elements. +#[inline] +#[rustc_legacy_const_generics(0, 1)] +#[target_feature(enable = "amx-avx512,avx10.2")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tcvtrowd2ps, TILE = 0, ROW = 0) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_cvtrowd2psi() -> __m512 { + static_assert_uimm_bits!(TILE, 3); + static_assert_uimm_bits!(ROW, 6); + tcvtrowd2psi(TILE as i8, ROW as u32).as_m512() +} + /// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit) /// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting /// 16-bit elements are placed in the high 16-bits within each 32-bit element of the returned vector. @@ -414,6 +430,23 @@ pub unsafe fn _tile_cvtrowps2phh(row: u32) -> __m512h { tcvtrowps2phh(TILE as i8, row).as_m512h() } +/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit) +/// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting +/// 16-bit elements are placed in the high 16-bits within each 32-bit element of the returned vector. +#[inline] +#[rustc_legacy_const_generics(0, 1)] +#[target_feature(enable = "amx-avx512,avx10.2")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tcvtrowps2phh, TILE = 0, ROW = 0) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_cvtrowps2phhi() -> __m512h { + static_assert_uimm_bits!(TILE, 3); + static_assert_uimm_bits!(ROW, 6); + tcvtrowps2phhi(TILE as i8, ROW as u32).as_m512h() +} + /// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit) /// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting /// 16-bit elements are placed in the low 16-bits within each 32-bit element of the returned vector. @@ -430,6 +463,23 @@ pub unsafe fn _tile_cvtrowps2phl(row: u32) -> __m512h { tcvtrowps2phl(TILE as i8, row).as_m512h() } +/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit) +/// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting +/// 16-bit elements are placed in the low 16-bits within each 32-bit element of the returned vector. +#[inline] +#[rustc_legacy_const_generics(0, 1)] +#[target_feature(enable = "amx-avx512,avx10.2")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tcvtrowps2phl, TILE = 0, ROW = 0) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_cvtrowps2phli() -> __m512h { + static_assert_uimm_bits!(TILE, 3); + static_assert_uimm_bits!(ROW, 6); + tcvtrowps2phli(TILE as i8, ROW as u32).as_m512h() +} + /// Moves one row of tile data into a zmm vector register #[inline] #[rustc_legacy_const_generics(0)] @@ -444,6 +494,21 @@ pub unsafe fn _tile_movrow(row: u32) -> __m512i { tilemovrow(TILE as i8, row).as_m512i() } +/// Moves one row of tile data into a zmm vector register +#[inline] +#[rustc_legacy_const_generics(0, 1)] +#[target_feature(enable = "amx-avx512,avx10.2")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tilemovrow, TILE = 0, ROW = 0) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_movrowi() -> __m512i { + static_assert_uimm_bits!(TILE, 3); + static_assert_uimm_bits!(ROW, 6); + tilemovrowi(TILE as i8, ROW as u32).as_m512i() +} + #[allow(improper_ctypes)] unsafe extern "C" { #[link_name = "llvm.x86.ldtilecfg"] @@ -492,12 +557,20 @@ unsafe extern "C" { fn tmmultf32ps(dst: i8, a: i8, b: i8); #[link_name = "llvm.x86.tcvtrowd2ps"] fn tcvtrowd2ps(tile: i8, row: u32) -> f32x16; + #[link_name = "llvm.x86.tcvtrowd2psi"] + fn tcvtrowd2psi(tile: i8, row: u32) -> f32x16; #[link_name = "llvm.x86.tcvtrowps2phh"] fn tcvtrowps2phh(tile: i8, row: u32) -> f16x32; + #[link_name = "llvm.x86.tcvtrowps2phhi"] + fn tcvtrowps2phhi(tile: i8, row: u32) -> f16x32; #[link_name = "llvm.x86.tcvtrowps2phl"] fn tcvtrowps2phl(tile: i8, row: u32) -> f16x32; + #[link_name = "llvm.x86.tcvtrowps2phli"] + fn tcvtrowps2phli(tile: i8, row: u32) -> f16x32; #[link_name = "llvm.x86.tilemovrow"] fn tilemovrow(tile: i8, row: u32) -> i32x16; + #[link_name = "llvm.x86.tilemovrowi"] + fn tilemovrowi(tile: i8, row: u32) -> i32x16; } #[cfg(test)] @@ -1032,6 +1105,50 @@ mod tests { } } + macro_rules! wrap_imm4 { + ($name:ident :: <$TILE:literal>, $row:expr) => { + match $row { + 0 => $name::<$TILE, 0>(), + 1 => $name::<$TILE, 1>(), + 2 => $name::<$TILE, 2>(), + 3 => $name::<$TILE, 3>(), + 4 => $name::<$TILE, 4>(), + 5 => $name::<$TILE, 5>(), + 6 => $name::<$TILE, 6>(), + 7 => $name::<$TILE, 7>(), + 8 => $name::<$TILE, 8>(), + 9 => $name::<$TILE, 9>(), + 10 => $name::<$TILE, 10>(), + 11 => $name::<$TILE, 11>(), + 12 => $name::<$TILE, 12>(), + 13 => $name::<$TILE, 13>(), + 14 => $name::<$TILE, 14>(), + 15 => $name::<$TILE, 15>(), + _ => panic!("row index out of range"), + } + }; + } + + #[simd_test(enable = "amx-avx512,avx10.2")] + fn test_tile_movrowi() { + unsafe { + _init_amx(); + let array: [[u8; 64]; 16] = array::from_fn(|i| [i as _; _]); + + let mut config = __tilecfg::default(); + config.palette = 1; + config.colsb[0] = 64; + config.rows[0] = 16; + _tile_loadconfig(config.as_ptr()); + _tile_loadd::<0>(array.as_ptr().cast(), 64); + + for i in 0..16 { + let row = wrap_imm4!(_tile_movrowi::<0>, i); + assert_eq!(*row.as_u8x64().as_array(), [i as _; _]); + } + } + } + #[simd_test(enable = "amx-avx512,avx10.2")] fn test_tile_cvtrowd2ps() { unsafe { @@ -1051,6 +1168,26 @@ mod tests { } } + #[simd_test(enable = "amx-avx512,avx10.2")] + fn test_tile_cvtrowd2psi() { + unsafe { + _init_amx(); + let array: [[u32; 16]; 16] = array::from_fn(|i| [i as _; _]); + + let mut config = __tilecfg::default(); + config.palette = 1; + config.colsb[0] = 64; + config.rows[0] = 16; + _tile_loadconfig(config.as_ptr()); + _tile_loadd::<0>(array.as_ptr().cast(), 64); + + for i in 0..16 { + let row = wrap_imm4!(_tile_cvtrowd2psi::<0>, i); + assert_eq!(*row.as_f32x16().as_array(), [i as _; _]); + } + } + } + #[simd_test(enable = "amx-avx512,avx10.2")] fn test_tile_cvtrowps2phh() { unsafe { @@ -1073,6 +1210,28 @@ mod tests { } } + #[simd_test(enable = "amx-avx512,avx10.2")] + fn test_tile_cvtrowps2phhi() { + unsafe { + _init_amx(); + let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]); + + let mut config = __tilecfg::default(); + config.palette = 1; + config.colsb[0] = 64; + config.rows[0] = 16; + _tile_loadconfig(config.as_ptr()); + _tile_loadd::<0>(array.as_ptr().cast(), 64); + for i in 0..16 { + let row = wrap_imm4!(_tile_cvtrowps2phhi::<0>, i); + assert_eq!( + *row.as_f16x32().as_array(), + array::from_fn(|j| if j & 1 == 0 { 0.0 } else { i as _ }) + ); + } + } + } + #[simd_test(enable = "amx-avx512,avx10.2")] fn test_tile_cvtrowps2phl() { unsafe { @@ -1095,6 +1254,28 @@ mod tests { } } + #[simd_test(enable = "amx-avx512,avx10.2")] + fn test_tile_cvtrowps2phli() { + unsafe { + _init_amx(); + let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]); + + let mut config = __tilecfg::default(); + config.palette = 1; + config.colsb[0] = 64; + config.rows[0] = 16; + _tile_loadconfig(config.as_ptr()); + _tile_loadd::<0>(array.as_ptr().cast(), 64); + for i in 0..16 { + let row = wrap_imm4!(_tile_cvtrowps2phli::<0>, i); + assert_eq!( + *row.as_f16x32().as_array(), + array::from_fn(|j| if j & 1 == 0 { i as _ } else { 0.0 }) + ); + } + } + } + #[simd_test(enable = "amx-tf32")] fn test_tile_mmultf32ps() { unsafe { diff --git a/library/stdarch/crates/core_arch/src/x86_64/mod.rs b/library/stdarch/crates/core_arch/src/x86_64/mod.rs index 9caab44e46cd7..46384176e005e 100644 --- a/library/stdarch/crates/core_arch/src/x86_64/mod.rs +++ b/library/stdarch/crates/core_arch/src/x86_64/mod.rs @@ -81,3 +81,7 @@ pub use self::avx512fp16::*; mod amx; #[unstable(feature = "x86_amx_intrinsics", issue = "126622")] pub use self::amx::*; + +mod movrs; +#[unstable(feature = "movrs_target_feature", issue = "137976")] +pub use self::movrs::*; diff --git a/library/stdarch/crates/core_arch/src/x86_64/movrs.rs b/library/stdarch/crates/core_arch/src/x86_64/movrs.rs new file mode 100644 index 0000000000000..fc669bbb1ca59 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86_64/movrs.rs @@ -0,0 +1,94 @@ +//! Read-shared Move instructions + +#[cfg(test)] +use stdarch_test::assert_instr; + +unsafe extern "unadjusted" { + #[link_name = "llvm.x86.movrsqi"] + fn movrsqi(src: *const i8) -> i8; + #[link_name = "llvm.x86.movrshi"] + fn movrshi(src: *const i16) -> i16; + #[link_name = "llvm.x86.movrssi"] + fn movrssi(src: *const i32) -> i32; + #[link_name = "llvm.x86.movrsdi"] + fn movrsdi(src: *const i64) -> i64; +} + +/// Moves a byte from the source to the destination, with an indication that the source memory +/// location is likely to become read-shared by multiple processors, i.e., read in the future by at +/// least one other processor before it is written, assuming it is ever written in the future. +#[inline] +#[target_feature(enable = "movrs")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(movrs))] +#[unstable(feature = "movrs_target_feature", issue = "137976")] +pub unsafe fn _movrs_i8(src: *const i8) -> i8 { + movrsqi(src) +} + +/// Moves a 16-bit word from the source to the destination, with an indication that the source memory +/// location is likely to become read-shared by multiple processors, i.e., read in the future by at +/// least one other processor before it is written, assuming it is ever written in the future. +#[inline] +#[target_feature(enable = "movrs")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(movrs))] +#[unstable(feature = "movrs_target_feature", issue = "137976")] +pub unsafe fn _movrs_i16(src: *const i16) -> i16 { + movrshi(src) +} + +/// Moves a 32-bit doubleword from the source to the destination, with an indication that the source +/// memory location is likely to become read-shared by multiple processors, i.e., read in the future +/// by at least one other processor before it is written, assuming it is ever written in the future. +#[inline] +#[target_feature(enable = "movrs")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(movrs))] +#[unstable(feature = "movrs_target_feature", issue = "137976")] +pub unsafe fn _movrs_i32(src: *const i32) -> i32 { + movrssi(src) +} + +/// Moves a 64-bit quadword from the source to the destination, with an indication that the source +/// memory location is likely to become read-shared by multiple processors, i.e., read in the future +/// by at least one other processor before it is written, assuming it is ever written in the future. +#[inline] +#[target_feature(enable = "movrs")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(movrs))] +#[unstable(feature = "movrs_target_feature", issue = "137976")] +pub unsafe fn _movrs_i64(src: *const i64) -> i64 { + movrsdi(src) +} + +#[cfg(test)] +mod tests { + use stdarch_test::simd_test; + + use super::*; + + #[simd_test(enable = "movrs")] + fn test_movrs_i8() { + let x: i8 = 42; + let y = unsafe { _movrs_i8(&x) }; + assert_eq!(x, y); + } + + #[simd_test(enable = "movrs")] + fn test_movrs_i16() { + let x: i16 = 42; + let y = unsafe { _movrs_i16(&x) }; + assert_eq!(x, y); + } + + #[simd_test(enable = "movrs")] + fn test_movrs_i32() { + let x: i32 = 42; + let y = unsafe { _movrs_i32(&x) }; + assert_eq!(x, y); + } + + #[simd_test(enable = "movrs")] + fn test_movrs_i64() { + let x: i64 = 42; + let y = unsafe { _movrs_i64(&x) }; + assert_eq!(x, y); + } +} diff --git a/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index 4df899a202cfb..e88860717b6df 100644 --- a/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -6625,6 +6625,7 @@ intrinsics: arch: aarch64,arm64ec + - name: "vmaxnm{neon_type.no}" doc: Floating-point Maximum Number (vector) arguments: ["a: {neon_type}", "b: {neon_type}"] @@ -6636,7 +6637,11 @@ intrinsics: - float64x1_t - float64x2_t compose: - - FnCall: [simd_fmax, [a, b]] + - LLVMLink: + name: "fmaxnm.{neon_type}" + links: + - link: "llvm.aarch64.neon.fmaxnm.{neon_type}" + arch: aarch64,arm64ec - name: "vmaxnmh_{type}" @@ -6652,7 +6657,11 @@ intrinsics: types: - f16 compose: - - FnCall: ["f16::max", [a, b]] + - LLVMLink: + name: "vmaxh.{neon_type}" + links: + - link: "llvm.aarch64.neon.fmaxnm.{type}" + arch: aarch64,arm64ec - name: "vminnmh_{type}" @@ -6668,7 +6677,11 @@ intrinsics: types: - f16 compose: - - FnCall: ["f16::min", [a, b]] + - LLVMLink: + name: "vminh.{neon_type}" + links: + - link: "llvm.aarch64.neon.fminnm.{type}" + arch: aarch64,arm64ec - name: "vmaxnmv{neon_type[0].no}" @@ -6682,7 +6695,11 @@ intrinsics: - [float32x2_t, f32] - [float64x2_t, f64] compose: - - FnCall: [simd_reduce_max, [a]] + - LLVMLink: + name: "fmaxnmv.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fmaxnmv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec - name: "vmaxnmv{neon_type[0].no}" doc: Floating-point maximum number across vector @@ -6694,7 +6711,11 @@ intrinsics: types: - [float32x4_t, f32] compose: - - FnCall: [simd_reduce_max, [a]] + - LLVMLink: + name: "fmaxnmv.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fmaxnmv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec - name: "vmaxnmv{neon_type[0].no}" @@ -6711,7 +6732,11 @@ intrinsics: - [float16x4_t, f16] - [float16x8_t, f16] compose: - - FnCall: [simd_reduce_max, [a]] + - LLVMLink: + name: "fmaxnmv.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fmaxnmv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec - name: "vminnmv{neon_type[0].no}" @@ -6728,7 +6753,11 @@ intrinsics: - [float16x4_t, f16] - [float16x8_t, f16] compose: - - FnCall: [simd_reduce_min, [a]] + - LLVMLink: + name: "fminnmv.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fminnmv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec - name: "vmaxv{neon_type[0].no}" @@ -6837,7 +6866,11 @@ intrinsics: - float64x1_t - float64x2_t compose: - - FnCall: [simd_fmin, [a, b]] + - LLVMLink: + name: "fminnm.{neon_type}" + links: + - link: "llvm.aarch64.neon.fminnm.{neon_type}" + arch: aarch64,arm64ec - name: "vminnmv{neon_type[0].no}" doc: "Floating-point minimum number across vector" @@ -6851,7 +6884,11 @@ intrinsics: - [float32x2_t, "f32"] - [float64x2_t, "f64"] compose: - - FnCall: [simd_reduce_min, [a]] + - LLVMLink: + name: "vminnmv.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fminnmv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec - name: "vminnmv{neon_type[0].no}" doc: "Floating-point minimum number across vector" @@ -6864,7 +6901,11 @@ intrinsics: types: - [float32x4_t, "f32"] compose: - - FnCall: [simd_reduce_min, [a]] + - LLVMLink: + name: "vminnmv.{neon_type[0]}" + links: + - link: "llvm.aarch64.neon.fminnmv.{type[1]}.{neon_type[0]}" + arch: aarch64,arm64ec - name: "vmovl_high{neon_type[0].noq}" doc: Vector move diff --git a/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index 5104ae607ccff..56b2252c9ef0c 100644 --- a/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -7324,7 +7324,13 @@ intrinsics: - float32x2_t - float32x4_t compose: - - FnCall: [simd_fmax, [a, b]] + - LLVMLink: + name: "fmaxnm.{neon_type}" + links: + - link: "llvm.arm.neon.vmaxnm.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.fmaxnm.{neon_type}" + arch: aarch64,arm64ec - name: "vmaxnm{neon_type.no}" @@ -7344,7 +7350,13 @@ intrinsics: - float16x4_t - float16x8_t compose: - - FnCall: [simd_fmax, [a, b]] + - LLVMLink: + name: "fmaxnm.{neon_type}" + links: + - link: "llvm.arm.neon.vmaxnm.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.fmaxnm.{neon_type}" + arch: aarch64,arm64ec - name: "vminnm{neon_type.no}" @@ -7364,7 +7376,13 @@ intrinsics: - float16x4_t - float16x8_t compose: - - FnCall: [simd_fmin, [a, b]] + - LLVMLink: + name: "fminnm.{neon_type}" + links: + - link: "llvm.arm.neon.vminnm.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.fminnm.{neon_type}" + arch: aarch64,arm64ec - name: "vmin{neon_type.no}" @@ -7477,7 +7495,13 @@ intrinsics: - float32x2_t - float32x4_t compose: - - FnCall: [simd_fmin, [a, b]] + - LLVMLink: + name: "fminnm.{neon_type}" + links: + - link: "llvm.arm.neon.vminnm.{neon_type}" + arch: arm + - link: "llvm.aarch64.neon.fminnm.{neon_type}" + arch: aarch64,arm64ec - name: "vpadd{neon_type.no}" doc: Floating-point add pairwise diff --git a/library/stdarch/crates/stdarch-verify/tests/arm.rs b/library/stdarch/crates/stdarch-verify/tests/arm.rs index 86897908e062c..3ef9ce2a38b69 100644 --- a/library/stdarch/crates/stdarch-verify/tests/arm.rs +++ b/library/stdarch/crates/stdarch-verify/tests/arm.rs @@ -445,6 +445,7 @@ fn verify_all_signatures() { && !rust.file.ends_with("v7.rs\"") && !rust.file.ends_with("v8.rs\"") && !rust.file.ends_with("mte.rs\"") + && !rust.file.ends_with("rand.rs\"") && !rust.file.ends_with("ex.rs\"") && !skip_intrinsic_verify.contains(&rust.name) { diff --git a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs index 2ac05e28cb4ce..7fc47be42e14b 100644 --- a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs +++ b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs @@ -211,6 +211,7 @@ fn verify_all_signatures() { "_rdseed64_step", // Prefetch "_mm_prefetch", + "_m_prefetchrs", // CMPXCHG "cmpxchg16b", // Undefined @@ -305,7 +306,7 @@ fn verify_all_signatures() { } // FIXME: these have not been added to Intrinsics Guide yet - if ["amx-avx512", "amx-fp8", "amx-movrs", "amx-tf32"] + if ["amx-avx512", "amx-fp8", "amx-movrs", "amx-tf32", "movrs"] .iter() .any(|f| feature.contains(f)) { diff --git a/library/stdarch/examples/hex.rs b/library/stdarch/examples/hex.rs index 621f55bc0951f..21827b375adaf 100644 --- a/library/stdarch/examples/hex.rs +++ b/library/stdarch/examples/hex.rs @@ -13,7 +13,6 @@ //! and you should see `746573740a` get printed out. #![allow(internal_features)] -#![feature(wasm_target_feature)] #![cfg_attr(test, feature(test))] #![cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), diff --git a/library/stdarch/rust-version b/library/stdarch/rust-version index b22c6c3869c62..db9492636f6ac 100644 --- a/library/stdarch/rust-version +++ b/library/stdarch/rust-version @@ -1 +1 @@ -139651428df86cf88443295542c12ea617cbb587 +eda4fc7733ee89e484d7120cafbd80dcb2fce66e diff --git a/src/tools/miri/src/shims/x86/avx512.rs b/src/tools/miri/src/shims/x86/avx512.rs index b057a78b6c8ee..23538f0dea965 100644 --- a/src/tools/miri/src/shims/x86/avx512.rs +++ b/src/tools/miri/src/shims/x86/avx512.rs @@ -188,23 +188,26 @@ fn vpdpbusd<'tcx>( let (b, b_len) = ecx.project_to_simd(b)?; let (dest, dest_len) = ecx.project_to_simd(dest)?; - // fn vpdpbusd(src: i32x16, a: i32x16, b: i32x16) -> i32x16; - // fn vpdpbusd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - // fn vpdpbusd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + // fn vpdpbusd(src: i32x16, a: u8x64, b: i8x64) -> i32x16; + // fn vpdpbusd256(src: i32x8, a: u8x32, b: i8x32) -> i32x8; + // fn vpdpbusd128(src: i32x4, a: u8x16, b: i8x16) -> i32x4; assert_eq!(dest_len, src_len); - assert_eq!(dest_len, a_len); - assert_eq!(dest_len, b_len); + assert_eq!(dest_len * 4, a_len); + assert_eq!(a_len, b_len); for i in 0..dest_len { let src = ecx.read_scalar(&ecx.project_index(&src, i)?)?.to_i32()?; - let a = ecx.read_scalar(&ecx.project_index(&a, i)?)?.to_u32()?; - let b = ecx.read_scalar(&ecx.project_index(&b, i)?)?.to_u32()?; let dest = ecx.project_index(&dest, i)?; - let zipped = a.to_le_bytes().into_iter().zip(b.to_le_bytes()); - let intermediate_sum: i32 = zipped - .map(|(a, b)| i32::from(a).strict_mul(i32::from(b.cast_signed()))) - .fold(0, |x, y| x.strict_add(y)); + let mut intermediate_sum: i32 = 0; + for j in 0..4 { + let idx = i.strict_mul(4).strict_add(j); + let a = ecx.read_scalar(&ecx.project_index(&a, idx)?)?.to_u8()?; + let b = ecx.read_scalar(&ecx.project_index(&b, idx)?)?.to_i8()?; + + let product = i32::from(a).strict_mul(i32::from(b)); + intermediate_sum = intermediate_sum.strict_add(product); + } // Use `wrapping_add` because `src` is an arbitrary i32 and the addition can overflow. let res = Scalar::from_i32(intermediate_sum.wrapping_add(src));