From a24022ad4e98bfc5adc47cc114db57b68c8511d2 Mon Sep 17 00:00:00 2001
From: sayantn <sayantn05@gmail.com>
Date: Mon, 2 Mar 2026 00:32:24 +0530
Subject: [PATCH 01/18] Fix LLVM intrinsic signatures for AVX-VNNI

---
 .../crates/core_arch/src/x86/avx512vnni.rs    | 160 +++++++++---------
 1 file changed, 80 insertions(+), 80 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs b/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs
index 49b790b151049..8cd8764f24868 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs
@@ -12,7 +12,7 @@ use stdarch_test::assert_instr;
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpwssd))]
 pub fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) }
+    unsafe { transmute(vpdpwssd(src.as_i32x16(), a.as_i16x32(), b.as_i16x32())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -51,7 +51,7 @@ pub fn _mm512_maskz_dpwssd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m5
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpwssd))]
 pub fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@@ -62,7 +62,7 @@ pub fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpwssd))]
 pub fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -101,7 +101,7 @@ pub fn _mm256_maskz_dpwssd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m25
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpwssd))]
 pub fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i16x8(), b.as_i16x8())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@@ -112,7 +112,7 @@ pub fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpwssd))]
 pub fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i16x8(), b.as_i16x8())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -151,7 +151,7 @@ pub fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i)
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpwssds))]
 pub fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) }
+    unsafe { transmute(vpdpwssds(src.as_i32x16(), a.as_i16x32(), b.as_i16x32())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -190,7 +190,7 @@ pub fn _mm512_maskz_dpwssds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpwssds))]
 pub fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@@ -201,7 +201,7 @@ pub fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpwssds))]
 pub fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -240,7 +240,7 @@ pub fn _mm256_maskz_dpwssds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m2
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpwssds))]
 pub fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i16x8(), b.as_i16x8())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@@ -251,7 +251,7 @@ pub fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpwssds))]
 pub fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i16x8(), b.as_i16x8())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -290,7 +290,7 @@ pub fn _mm_maskz_dpwssds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpbusd))]
 pub fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) }
+    unsafe { transmute(vpdpbusd(src.as_i32x16(), a.as_u8x64(), b.as_i8x64())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -329,7 +329,7 @@ pub fn _mm512_maskz_dpbusd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m5
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpbusd))]
 pub fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_u8x32(), b.as_i8x32())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@@ -340,7 +340,7 @@ pub fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpbusd))]
 pub fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_u8x32(), b.as_i8x32())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -379,7 +379,7 @@ pub fn _mm256_maskz_dpbusd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m25
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpbusd))]
 pub fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_u8x16(), b.as_i8x16())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@@ -390,7 +390,7 @@ pub fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpbusd))]
 pub fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_u8x16(), b.as_i8x16())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -429,7 +429,7 @@ pub fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i)
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpbusds))]
 pub fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) }
+    unsafe { transmute(vpdpbusds(src.as_i32x16(), a.as_u8x64(), b.as_i8x64())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -468,7 +468,7 @@ pub fn _mm512_maskz_dpbusds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpbusds))]
 pub fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_u8x32(), b.as_i8x32())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@@ -479,7 +479,7 @@ pub fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpbusds))]
 pub fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_u8x32(), b.as_i8x32())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -518,7 +518,7 @@ pub fn _mm256_maskz_dpbusds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m2
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpbusds))]
 pub fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_u8x16(), b.as_i8x16())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@@ -529,7 +529,7 @@ pub fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpdpbusds))]
 pub fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_u8x16(), b.as_i8x16())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -570,7 +570,7 @@ pub fn _mm_maskz_dpbusds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i
 #[cfg_attr(test, assert_instr(vpdpbssd))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpbssd_128(src.as_i32x4(), a.as_i8x16(), b.as_i8x16())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
@@ -583,7 +583,7 @@ pub fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpdpbssd))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpbssd_256(src.as_i32x8(), a.as_i8x32(), b.as_i8x32())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
@@ -596,7 +596,7 @@ pub fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpdpbssds))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpbssds_128(src.as_i32x4(), a.as_i8x16(), b.as_i8x16())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
@@ -609,7 +609,7 @@ pub fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpdpbssds))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpbssds_256(src.as_i32x8(), a.as_i8x32(), b.as_i8x32())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
@@ -622,7 +622,7 @@ pub fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpdpbsud))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpbsud_128(src.as_i32x4(), a.as_i8x16(), b.as_u8x16())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
@@ -635,7 +635,7 @@ pub fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpdpbsud))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpbsud_256(src.as_i32x8(), a.as_i8x32(), b.as_u8x32())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
@@ -648,7 +648,7 @@ pub fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpdpbsuds))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i8x16(), b.as_u8x16())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
@@ -661,7 +661,7 @@ pub fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpdpbsuds))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i8x32(), b.as_u8x32())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
@@ -674,7 +674,7 @@ pub fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpdpbuud))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpbuud_128(src.as_i32x4(), a.as_u8x16(), b.as_u8x16())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
@@ -687,7 +687,7 @@ pub fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpdpbuud))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpbuud_256(src.as_i32x8(), a.as_u8x32(), b.as_u8x32())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
@@ -700,7 +700,7 @@ pub fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpdpbuuds))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpbuuds_128(src.as_i32x4(), a.as_u8x16(), b.as_u8x16())) }
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
@@ -713,7 +713,7 @@ pub fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpdpbuuds))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpbuuds_256(src.as_i32x8(), a.as_u8x32(), b.as_u8x32())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
@@ -726,7 +726,7 @@ pub fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpdpwsud))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpwsud_128(src.as_i32x4(), a.as_i16x8(), b.as_u16x8())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
@@ -739,7 +739,7 @@ pub fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpdpwsud))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpwsud_256(src.as_i32x8(), a.as_i16x16(), b.as_u16x16())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
@@ -752,7 +752,7 @@ pub fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpdpwsuds))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i16x8(), b.as_u16x8())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
@@ -765,7 +765,7 @@ pub fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpdpwsuds))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i16x16(), b.as_u16x16())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
@@ -778,7 +778,7 @@ pub fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpdpwusd))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpwusd_128(src.as_i32x4(), a.as_u16x8(), b.as_i16x8())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
@@ -791,7 +791,7 @@ pub fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpdpwusd))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpwusd_256(src.as_i32x8(), a.as_u16x16(), b.as_i16x16())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
@@ -804,7 +804,7 @@ pub fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpdpwusds))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpwusds_128(src.as_i32x4(), a.as_u16x8(), b.as_i16x8())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
@@ -817,7 +817,7 @@ pub fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpdpwusds))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpwusds_256(src.as_i32x8(), a.as_u16x16(), b.as_i16x16())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
@@ -830,7 +830,7 @@ pub fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpdpwuud))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpwuud_128(src.as_i32x4(), a.as_u16x8(), b.as_u16x8())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
@@ -843,7 +843,7 @@ pub fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpdpwuud))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpwuud_256(src.as_i32x8(), a.as_u16x16(), b.as_u16x16())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
@@ -856,7 +856,7 @@ pub fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpdpwuuds))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(vpdpwuuds_128(src.as_i32x4(), a.as_u16x8(), b.as_u16x8())) }
 }
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
@@ -869,98 +869,98 @@ pub fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpdpwuuds))]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(vpdpwuuds_256(src.as_i32x8(), a.as_u16x16(), b.as_u16x16())) }
 }
 
 #[allow(improper_ctypes)]
 unsafe extern "C" {
     #[link_name = "llvm.x86.avx512.vpdpwssd.512"]
-    fn vpdpwssd(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
+    fn vpdpwssd(src: i32x16, a: i16x32, b: i16x32) -> i32x16;
     #[link_name = "llvm.x86.avx512.vpdpwssd.256"]
-    fn vpdpwssd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpwssd256(src: i32x8, a: i16x16, b: i16x16) -> i32x8;
     #[link_name = "llvm.x86.avx512.vpdpwssd.128"]
-    fn vpdpwssd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpwssd128(src: i32x4, a: i16x8, b: i16x8) -> i32x4;
 
     #[link_name = "llvm.x86.avx512.vpdpwssds.512"]
-    fn vpdpwssds(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
+    fn vpdpwssds(src: i32x16, a: i16x32, b: i16x32) -> i32x16;
     #[link_name = "llvm.x86.avx512.vpdpwssds.256"]
-    fn vpdpwssds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpwssds256(src: i32x8, a: i16x16, b: i16x16) -> i32x8;
     #[link_name = "llvm.x86.avx512.vpdpwssds.128"]
-    fn vpdpwssds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpwssds128(src: i32x4, a: i16x8, b: i16x8) -> i32x4;
 
     #[link_name = "llvm.x86.avx512.vpdpbusd.512"]
-    fn vpdpbusd(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
+    fn vpdpbusd(src: i32x16, a: u8x64, b: i8x64) -> i32x16;
     #[link_name = "llvm.x86.avx512.vpdpbusd.256"]
-    fn vpdpbusd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpbusd256(src: i32x8, a: u8x32, b: i8x32) -> i32x8;
     #[link_name = "llvm.x86.avx512.vpdpbusd.128"]
-    fn vpdpbusd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpbusd128(src: i32x4, a: u8x16, b: i8x16) -> i32x4;
 
     #[link_name = "llvm.x86.avx512.vpdpbusds.512"]
-    fn vpdpbusds(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
+    fn vpdpbusds(src: i32x16, a: u8x64, b: i8x64) -> i32x16;
     #[link_name = "llvm.x86.avx512.vpdpbusds.256"]
-    fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpbusds256(src: i32x8, a: u8x32, b: i8x32) -> i32x8;
     #[link_name = "llvm.x86.avx512.vpdpbusds.128"]
-    fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpbusds128(src: i32x4, a: u8x16, b: i8x16) -> i32x4;
 
     #[link_name = "llvm.x86.avx2.vpdpbssd.128"]
-    fn vpdpbssd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpbssd_128(src: i32x4, a: i8x16, b: i8x16) -> i32x4;
     #[link_name = "llvm.x86.avx2.vpdpbssd.256"]
-    fn vpdpbssd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpbssd_256(src: i32x8, a: i8x32, b: i8x32) -> i32x8;
 
     #[link_name = "llvm.x86.avx2.vpdpbssds.128"]
-    fn vpdpbssds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpbssds_128(src: i32x4, a: i8x16, b: i8x16) -> i32x4;
     #[link_name = "llvm.x86.avx2.vpdpbssds.256"]
-    fn vpdpbssds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpbssds_256(src: i32x8, a: i8x32, b: i8x32) -> i32x8;
 
     #[link_name = "llvm.x86.avx2.vpdpbsud.128"]
-    fn vpdpbsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpbsud_128(src: i32x4, a: i8x16, b: u8x16) -> i32x4;
     #[link_name = "llvm.x86.avx2.vpdpbsud.256"]
-    fn vpdpbsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpbsud_256(src: i32x8, a: i8x32, b: u8x32) -> i32x8;
 
     #[link_name = "llvm.x86.avx2.vpdpbsuds.128"]
-    fn vpdpbsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpbsuds_128(src: i32x4, a: i8x16, b: u8x16) -> i32x4;
     #[link_name = "llvm.x86.avx2.vpdpbsuds.256"]
-    fn vpdpbsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpbsuds_256(src: i32x8, a: i8x32, b: u8x32) -> i32x8;
 
     #[link_name = "llvm.x86.avx2.vpdpbuud.128"]
-    fn vpdpbuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpbuud_128(src: i32x4, a: u8x16, b: u8x16) -> i32x4;
     #[link_name = "llvm.x86.avx2.vpdpbuud.256"]
-    fn vpdpbuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpbuud_256(src: i32x8, a: u8x32, b: u8x32) -> i32x8;
 
     #[link_name = "llvm.x86.avx2.vpdpbuuds.128"]
-    fn vpdpbuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpbuuds_128(src: i32x4, a: u8x16, b: u8x16) -> i32x4;
     #[link_name = "llvm.x86.avx2.vpdpbuuds.256"]
-    fn vpdpbuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpbuuds_256(src: i32x8, a: u8x32, b: u8x32) -> i32x8;
 
     #[link_name = "llvm.x86.avx2.vpdpwsud.128"]
-    fn vpdpwsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpwsud_128(src: i32x4, a: i16x8, b: u16x8) -> i32x4;
     #[link_name = "llvm.x86.avx2.vpdpwsud.256"]
-    fn vpdpwsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpwsud_256(src: i32x8, a: i16x16, b: u16x16) -> i32x8;
 
     #[link_name = "llvm.x86.avx2.vpdpwsuds.128"]
-    fn vpdpwsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpwsuds_128(src: i32x4, a: i16x8, b: u16x8) -> i32x4;
     #[link_name = "llvm.x86.avx2.vpdpwsuds.256"]
-    fn vpdpwsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpwsuds_256(src: i32x8, a: i16x16, b: u16x16) -> i32x8;
 
     #[link_name = "llvm.x86.avx2.vpdpwusd.128"]
-    fn vpdpwusd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpwusd_128(src: i32x4, a: u16x8, b: i16x8) -> i32x4;
     #[link_name = "llvm.x86.avx2.vpdpwusd.256"]
-    fn vpdpwusd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpwusd_256(src: i32x8, a: u16x16, b: i16x16) -> i32x8;
 
     #[link_name = "llvm.x86.avx2.vpdpwusds.128"]
-    fn vpdpwusds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpwusds_128(src: i32x4, a: u16x8, b: i16x8) -> i32x4;
     #[link_name = "llvm.x86.avx2.vpdpwusds.256"]
-    fn vpdpwusds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpwusds_256(src: i32x8, a: u16x16, b: i16x16) -> i32x8;
 
     #[link_name = "llvm.x86.avx2.vpdpwuud.128"]
-    fn vpdpwuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpwuud_128(src: i32x4, a: u16x8, b: u16x8) -> i32x4;
     #[link_name = "llvm.x86.avx2.vpdpwuud.256"]
-    fn vpdpwuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpwuud_256(src: i32x8, a: u16x16, b: u16x16) -> i32x8;
 
     #[link_name = "llvm.x86.avx2.vpdpwuuds.128"]
-    fn vpdpwuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    fn vpdpwuuds_128(src: i32x4, a: u16x8, b: u16x8) -> i32x4;
     #[link_name = "llvm.x86.avx2.vpdpwuuds.256"]
-    fn vpdpwuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    fn vpdpwuuds_256(src: i32x8, a: u16x16, b: u16x16) -> i32x8;
 }
 
 #[cfg(test)]

From 35f7c8bc53e41c983f8e565dce99e16ed1e4994a Mon Sep 17 00:00:00 2001
From: sayantn <sayantn05@gmail.com>
Date: Thu, 5 Mar 2026 00:43:31 +0530
Subject: [PATCH 02/18] Correct stability attribute for avx512bw intrinsics

---
 .../crates/core_arch/src/x86/avx512bw.rs      | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
index b41f8576cfe54..659d6c3be88e7 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
@@ -11753,7 +11753,7 @@ pub const fn _mm_maskz_cvtepi16_epi8(k: __mmask8, a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovswb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const fn _mm512_cvtsepi16_epi8(a: __m512i) -> __m256i {
     unsafe {
         simd_cast::<_, i8x32>(simd_imax(
@@ -11771,7 +11771,7 @@ pub const fn _mm512_cvtsepi16_epi8(a: __m512i) -> __m256i {
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovswb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const fn _mm512_mask_cvtsepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i {
     unsafe {
         simd_select_bitmask(k, _mm512_cvtsepi16_epi8(a).as_i8x32(), src.as_i8x32()).as_m256i()
@@ -11785,7 +11785,7 @@ pub const fn _mm512_mask_cvtsepi16_epi8(src: __m256i, k: __mmask32, a: __m512i)
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovswb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const fn _mm512_maskz_cvtsepi16_epi8(k: __mmask32, a: __m512i) -> __m256i {
     unsafe { simd_select_bitmask(k, _mm512_cvtsepi16_epi8(a).as_i8x32(), i8x32::ZERO).as_m256i() }
 }
@@ -11797,7 +11797,7 @@ pub const fn _mm512_maskz_cvtsepi16_epi8(k: __mmask32, a: __m512i) -> __m256i {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovswb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const fn _mm256_cvtsepi16_epi8(a: __m256i) -> __m128i {
     unsafe {
         simd_cast::<_, i8x16>(simd_imax(
@@ -11815,7 +11815,7 @@ pub const fn _mm256_cvtsepi16_epi8(a: __m256i) -> __m128i {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovswb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const fn _mm256_mask_cvtsepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i {
     unsafe {
         simd_select_bitmask(k, _mm256_cvtsepi16_epi8(a).as_i8x16(), src.as_i8x16()).as_m128i()
@@ -11829,7 +11829,7 @@ pub const fn _mm256_mask_cvtsepi16_epi8(src: __m128i, k: __mmask16, a: __m256i)
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovswb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const fn _mm256_maskz_cvtsepi16_epi8(k: __mmask16, a: __m256i) -> __m128i {
     unsafe { simd_select_bitmask(k, _mm256_cvtsepi16_epi8(a).as_i8x16(), i8x16::ZERO).as_m128i() }
 }
@@ -11874,7 +11874,7 @@ pub fn _mm_maskz_cvtsepi16_epi8(k: __mmask8, a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovuswb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const fn _mm512_cvtusepi16_epi8(a: __m512i) -> __m256i {
     unsafe {
         simd_cast::<_, u8x32>(simd_imin(a.as_u16x32(), u16x32::splat(u8::MAX as _))).as_m256i()
@@ -11888,7 +11888,7 @@ pub const fn _mm512_cvtusepi16_epi8(a: __m512i) -> __m256i {
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovuswb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const fn _mm512_mask_cvtusepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i {
     unsafe {
         simd_select_bitmask(k, _mm512_cvtusepi16_epi8(a).as_u8x32(), src.as_u8x32()).as_m256i()
@@ -11902,7 +11902,7 @@ pub const fn _mm512_mask_cvtusepi16_epi8(src: __m256i, k: __mmask32, a: __m512i)
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovuswb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const fn _mm512_maskz_cvtusepi16_epi8(k: __mmask32, a: __m512i) -> __m256i {
     unsafe { simd_select_bitmask(k, _mm512_cvtusepi16_epi8(a).as_u8x32(), u8x32::ZERO).as_m256i() }
 }
@@ -11914,7 +11914,7 @@ pub const fn _mm512_maskz_cvtusepi16_epi8(k: __mmask32, a: __m512i) -> __m256i {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovuswb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const fn _mm256_cvtusepi16_epi8(a: __m256i) -> __m128i {
     unsafe {
         simd_cast::<_, u8x16>(simd_imin(a.as_u16x16(), u16x16::splat(u8::MAX as _))).as_m128i()
@@ -11928,7 +11928,7 @@ pub const fn _mm256_cvtusepi16_epi8(a: __m256i) -> __m128i {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovuswb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const fn _mm256_mask_cvtusepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i {
     unsafe {
         simd_select_bitmask(k, _mm256_cvtusepi16_epi8(a).as_u8x16(), src.as_u8x16()).as_m128i()
@@ -11942,7 +11942,7 @@ pub const fn _mm256_mask_cvtusepi16_epi8(src: __m128i, k: __mmask16, a: __m256i)
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovuswb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const fn _mm256_maskz_cvtusepi16_epi8(k: __mmask16, a: __m256i) -> __m128i {
     unsafe { simd_select_bitmask(k, _mm256_cvtusepi16_epi8(a).as_u8x16(), u8x16::ZERO).as_m128i() }
 }
@@ -12678,7 +12678,7 @@ pub unsafe fn _mm_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a:
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovwb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const unsafe fn _mm512_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) {
     let result = _mm512_cvtepi16_epi8(a).as_i8x32();
     let mask = simd_select_bitmask(k, i8x32::splat(!0), i8x32::ZERO);
@@ -12692,7 +12692,7 @@ pub const unsafe fn _mm512_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mma
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovwb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const unsafe fn _mm256_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) {
     let result = _mm256_cvtepi16_epi8(a).as_i8x16();
     let mask = simd_select_bitmask(k, i8x16::splat(!0), i8x16::ZERO);
@@ -12706,7 +12706,7 @@ pub const unsafe fn _mm256_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mma
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmovwb))]
-#[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
 pub const unsafe fn _mm_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
     let result: i8x8 = simd_shuffle!(
         _mm_cvtepi16_epi8(a).as_i8x16(),

From 985cd2399ac618bc409b0401c9aac36688b8a580 Mon Sep 17 00:00:00 2001
From: sayantn <sayantn05@gmail.com>
Date: Mon, 2 Mar 2026 00:27:37 +0530
Subject: [PATCH 03/18] Add immediate AMX intrinsics

---
 .../crates/core_arch/src/x86_64/amx.rs        | 181 ++++++++++++++++++
 1 file changed, 181 insertions(+)

diff --git a/library/stdarch/crates/core_arch/src/x86_64/amx.rs b/library/stdarch/crates/core_arch/src/x86_64/amx.rs
index 4e20e014cf20a..03bbe3e449258 100644
--- a/library/stdarch/crates/core_arch/src/x86_64/amx.rs
+++ b/library/stdarch/crates/core_arch/src/x86_64/amx.rs
@@ -398,6 +398,22 @@ pub unsafe fn _tile_cvtrowd2ps<const TILE: i32>(row: u32) -> __m512 {
     tcvtrowd2ps(TILE as i8, row).as_m512()
 }
 
+/// Moves a row from a tile register to a zmm register, converting the packed 32-bit signed integer
+/// elements to packed single-precision (32-bit) floating-point elements.
+#[inline]
+#[rustc_legacy_const_generics(0, 1)]
+#[target_feature(enable = "amx-avx512,avx10.2")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(tcvtrowd2ps, TILE = 0, ROW = 0)
+)]
+#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
+pub unsafe fn _tile_cvtrowd2psi<const TILE: i32, const ROW: i32>() -> __m512 {
+    static_assert_uimm_bits!(TILE, 3);
+    static_assert_uimm_bits!(ROW, 6);
+    tcvtrowd2psi(TILE as i8, ROW as u32).as_m512()
+}
+
 /// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
 /// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting
 /// 16-bit elements are placed in the high 16-bits within each 32-bit element of the returned vector.
@@ -414,6 +430,23 @@ pub unsafe fn _tile_cvtrowps2phh<const TILE: i32>(row: u32) -> __m512h {
     tcvtrowps2phh(TILE as i8, row).as_m512h()
 }
 
+/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
+/// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting
+/// 16-bit elements are placed in the high 16-bits within each 32-bit element of the returned vector.
+#[inline]
+#[rustc_legacy_const_generics(0, 1)]
+#[target_feature(enable = "amx-avx512,avx10.2")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(tcvtrowps2phh, TILE = 0, ROW = 0)
+)]
+#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
+pub unsafe fn _tile_cvtrowps2phhi<const TILE: i32, const ROW: i32>() -> __m512h {
+    static_assert_uimm_bits!(TILE, 3);
+    static_assert_uimm_bits!(ROW, 6);
+    tcvtrowps2phhi(TILE as i8, ROW as u32).as_m512h()
+}
+
 /// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
 /// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting
 /// 16-bit elements are placed in the low 16-bits within each 32-bit element of the returned vector.
@@ -430,6 +463,23 @@ pub unsafe fn _tile_cvtrowps2phl<const TILE: i32>(row: u32) -> __m512h {
     tcvtrowps2phl(TILE as i8, row).as_m512h()
 }
 
+/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
+/// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting
+/// 16-bit elements are placed in the low 16-bits within each 32-bit element of the returned vector.
+#[inline]
+#[rustc_legacy_const_generics(0, 1)]
+#[target_feature(enable = "amx-avx512,avx10.2")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(tcvtrowps2phl, TILE = 0, ROW = 0)
+)]
+#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
+pub unsafe fn _tile_cvtrowps2phli<const TILE: i32, const ROW: i32>() -> __m512h {
+    static_assert_uimm_bits!(TILE, 3);
+    static_assert_uimm_bits!(ROW, 6);
+    tcvtrowps2phli(TILE as i8, ROW as u32).as_m512h()
+}
+
 /// Moves one row of tile data into a zmm vector register
 #[inline]
 #[rustc_legacy_const_generics(0)]
@@ -444,6 +494,21 @@ pub unsafe fn _tile_movrow<const TILE: i32>(row: u32) -> __m512i {
     tilemovrow(TILE as i8, row).as_m512i()
 }
 
+/// Moves one row of tile data into a zmm vector register
+#[inline]
+#[rustc_legacy_const_generics(0, 1)]
+#[target_feature(enable = "amx-avx512,avx10.2")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(tilemovrow, TILE = 0, ROW = 0)
+)]
+#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
+pub unsafe fn _tile_movrowi<const TILE: i32, const ROW: i32>() -> __m512i {
+    static_assert_uimm_bits!(TILE, 3);
+    static_assert_uimm_bits!(ROW, 6);
+    tilemovrowi(TILE as i8, ROW as u32).as_m512i()
+}
+
 #[allow(improper_ctypes)]
 unsafe extern "C" {
     #[link_name = "llvm.x86.ldtilecfg"]
@@ -492,12 +557,20 @@ unsafe extern "C" {
     fn tmmultf32ps(dst: i8, a: i8, b: i8);
     #[link_name = "llvm.x86.tcvtrowd2ps"]
     fn tcvtrowd2ps(tile: i8, row: u32) -> f32x16;
+    #[link_name = "llvm.x86.tcvtrowd2psi"]
+    fn tcvtrowd2psi(tile: i8, row: u32) -> f32x16;
     #[link_name = "llvm.x86.tcvtrowps2phh"]
     fn tcvtrowps2phh(tile: i8, row: u32) -> f16x32;
+    #[link_name = "llvm.x86.tcvtrowps2phhi"]
+    fn tcvtrowps2phhi(tile: i8, row: u32) -> f16x32;
     #[link_name = "llvm.x86.tcvtrowps2phl"]
     fn tcvtrowps2phl(tile: i8, row: u32) -> f16x32;
+    #[link_name = "llvm.x86.tcvtrowps2phli"]
+    fn tcvtrowps2phli(tile: i8, row: u32) -> f16x32;
     #[link_name = "llvm.x86.tilemovrow"]
     fn tilemovrow(tile: i8, row: u32) -> i32x16;
+    #[link_name = "llvm.x86.tilemovrowi"]
+    fn tilemovrowi(tile: i8, row: u32) -> i32x16;
 }
 
 #[cfg(test)]
@@ -1032,6 +1105,50 @@ mod tests {
         }
     }
 
+    macro_rules! wrap_imm4 {
+        ($name:ident :: <$TILE:literal>, $row:expr) => {
+            match $row {
+                0 => $name::<$TILE, 0>(),
+                1 => $name::<$TILE, 1>(),
+                2 => $name::<$TILE, 2>(),
+                3 => $name::<$TILE, 3>(),
+                4 => $name::<$TILE, 4>(),
+                5 => $name::<$TILE, 5>(),
+                6 => $name::<$TILE, 6>(),
+                7 => $name::<$TILE, 7>(),
+                8 => $name::<$TILE, 8>(),
+                9 => $name::<$TILE, 9>(),
+                10 => $name::<$TILE, 10>(),
+                11 => $name::<$TILE, 11>(),
+                12 => $name::<$TILE, 12>(),
+                13 => $name::<$TILE, 13>(),
+                14 => $name::<$TILE, 14>(),
+                15 => $name::<$TILE, 15>(),
+                _ => panic!("row index out of range"),
+            }
+        };
+    }
+
+    #[simd_test(enable = "amx-avx512,avx10.2")]
+    fn test_tile_movrowi() {
+        unsafe {
+            _init_amx();
+            let array: [[u8; 64]; 16] = array::from_fn(|i| [i as _; _]);
+
+            let mut config = __tilecfg::default();
+            config.palette = 1;
+            config.colsb[0] = 64;
+            config.rows[0] = 16;
+            _tile_loadconfig(config.as_ptr());
+            _tile_loadd::<0>(array.as_ptr().cast(), 64);
+
+            for i in 0..16 {
+                let row = wrap_imm4!(_tile_movrowi::<0>, i);
+                assert_eq!(*row.as_u8x64().as_array(), [i as _; _]);
+            }
+        }
+    }
+
     #[simd_test(enable = "amx-avx512,avx10.2")]
     fn test_tile_cvtrowd2ps() {
         unsafe {
@@ -1051,6 +1168,26 @@ mod tests {
         }
     }
 
+    #[simd_test(enable = "amx-avx512,avx10.2")]
+    fn test_tile_cvtrowd2psi() {
+        unsafe {
+            _init_amx();
+            let array: [[u32; 16]; 16] = array::from_fn(|i| [i as _; _]);
+
+            let mut config = __tilecfg::default();
+            config.palette = 1;
+            config.colsb[0] = 64;
+            config.rows[0] = 16;
+            _tile_loadconfig(config.as_ptr());
+            _tile_loadd::<0>(array.as_ptr().cast(), 64);
+
+            for i in 0..16 {
+                let row = wrap_imm4!(_tile_cvtrowd2psi::<0>, i);
+                assert_eq!(*row.as_f32x16().as_array(), [i as _; _]);
+            }
+        }
+    }
+
     #[simd_test(enable = "amx-avx512,avx10.2")]
     fn test_tile_cvtrowps2phh() {
         unsafe {
@@ -1073,6 +1210,28 @@ mod tests {
         }
     }
 
+    #[simd_test(enable = "amx-avx512,avx10.2")]
+    fn test_tile_cvtrowps2phhi() {
+        unsafe {
+            _init_amx();
+            let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
+
+            let mut config = __tilecfg::default();
+            config.palette = 1;
+            config.colsb[0] = 64;
+            config.rows[0] = 16;
+            _tile_loadconfig(config.as_ptr());
+            _tile_loadd::<0>(array.as_ptr().cast(), 64);
+            for i in 0..16 {
+                let row = wrap_imm4!(_tile_cvtrowps2phhi::<0>, i);
+                assert_eq!(
+                    *row.as_f16x32().as_array(),
+                    array::from_fn(|j| if j & 1 == 0 { 0.0 } else { i as _ })
+                );
+            }
+        }
+    }
+
     #[simd_test(enable = "amx-avx512,avx10.2")]
     fn test_tile_cvtrowps2phl() {
         unsafe {
@@ -1095,6 +1254,28 @@ mod tests {
         }
     }
 
+    #[simd_test(enable = "amx-avx512,avx10.2")]
+    fn test_tile_cvtrowps2phli() {
+        unsafe {
+            _init_amx();
+            let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
+
+            let mut config = __tilecfg::default();
+            config.palette = 1;
+            config.colsb[0] = 64;
+            config.rows[0] = 16;
+            _tile_loadconfig(config.as_ptr());
+            _tile_loadd::<0>(array.as_ptr().cast(), 64);
+            for i in 0..16 {
+                let row = wrap_imm4!(_tile_cvtrowps2phli::<0>, i);
+                assert_eq!(
+                    *row.as_f16x32().as_array(),
+                    array::from_fn(|j| if j & 1 == 0 { i as _ } else { 0.0 })
+                );
+            }
+        }
+    }
+
     #[simd_test(enable = "amx-tf32")]
     fn test_tile_mmultf32ps() {
         unsafe {

From 48d573f8e5c5dc19c68d82b08af90275508bf65c Mon Sep 17 00:00:00 2001
From: sayantn <sayantn05@gmail.com>
Date: Thu, 5 Mar 2026 11:32:59 +0530
Subject: [PATCH 04/18] Add movrs intrinsics

---
 library/stdarch/crates/core_arch/src/lib.rs   |  3 +-
 .../stdarch/crates/core_arch/src/x86/mod.rs   |  4 +
 .../stdarch/crates/core_arch/src/x86/movrs.rs | 23 +++++
 .../crates/core_arch/src/x86_64/mod.rs        |  4 +
 .../crates/core_arch/src/x86_64/movrs.rs      | 94 +++++++++++++++++++
 .../crates/stdarch-verify/tests/x86-intel.rs  |  3 +-
 6 files changed, 129 insertions(+), 2 deletions(-)
 create mode 100644 library/stdarch/crates/core_arch/src/x86/movrs.rs
 create mode 100644 library/stdarch/crates/core_arch/src/x86_64/movrs.rs

diff --git a/library/stdarch/crates/core_arch/src/lib.rs b/library/stdarch/crates/core_arch/src/lib.rs
index 8a1bead7c4791..9255994e5ee81 100644
--- a/library/stdarch/crates/core_arch/src/lib.rs
+++ b/library/stdarch/crates/core_arch/src/lib.rs
@@ -39,7 +39,8 @@
     const_trait_impl,
     const_cmp,
     const_eval_select,
-    maybe_uninit_as_bytes
+    maybe_uninit_as_bytes,
+    movrs_target_feature
 )]
 #![cfg_attr(test, feature(test, abi_vectorcall, stdarch_internal))]
 #![deny(clippy::missing_inline_in_public_items)]
diff --git a/library/stdarch/crates/core_arch/src/x86/mod.rs b/library/stdarch/crates/core_arch/src/x86/mod.rs
index 9396507f08045..68a963f65b7d4 100644
--- a/library/stdarch/crates/core_arch/src/x86/mod.rs
+++ b/library/stdarch/crates/core_arch/src/x86/mod.rs
@@ -774,3 +774,7 @@ pub use self::avx512fp16::*;
 mod kl;
 #[stable(feature = "keylocker_x86", since = "1.89.0")]
 pub use self::kl::*;
+
+mod movrs;
+#[unstable(feature = "movrs_target_feature", issue = "137976")]
+pub use self::movrs::*;
diff --git a/library/stdarch/crates/core_arch/src/x86/movrs.rs b/library/stdarch/crates/core_arch/src/x86/movrs.rs
new file mode 100644
index 0000000000000..d5f4d146c44aa
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/movrs.rs
@@ -0,0 +1,23 @@
+//! Read-shared move intrinsics
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+unsafe extern "unadjusted" {
+    #[link_name = "llvm.x86.prefetchrs"]
+    fn prefetchrs(p: *const u8);
+}
+
+/// Prefetches the cache line that contains address `p`, with an indication that the source memory
+/// location is likely to become read-shared by multiple processors, i.e., read in the future by at
+/// least one other processor before it is written, assuming it is ever written in the future.
+///
+/// Note: this intrinsic is safe to use even though it takes a raw pointer argument. In general, this
+/// cannot change the behavior of the program, including not trapping on invalid pointers.
+#[inline]
+#[target_feature(enable = "movrs")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(prefetchrst2))]
+#[unstable(feature = "movrs_target_feature", issue = "137976")]
+pub fn _m_prefetchrs(p: *const u8) {
+    unsafe { prefetchrs(p) }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86_64/mod.rs b/library/stdarch/crates/core_arch/src/x86_64/mod.rs
index 9caab44e46cd7..46384176e005e 100644
--- a/library/stdarch/crates/core_arch/src/x86_64/mod.rs
+++ b/library/stdarch/crates/core_arch/src/x86_64/mod.rs
@@ -81,3 +81,7 @@ pub use self::avx512fp16::*;
 mod amx;
 #[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
 pub use self::amx::*;
+
+mod movrs;
+#[unstable(feature = "movrs_target_feature", issue = "137976")]
+pub use self::movrs::*;
diff --git a/library/stdarch/crates/core_arch/src/x86_64/movrs.rs b/library/stdarch/crates/core_arch/src/x86_64/movrs.rs
new file mode 100644
index 0000000000000..fc669bbb1ca59
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86_64/movrs.rs
@@ -0,0 +1,94 @@
+//! Read-shared Move instructions
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+unsafe extern "unadjusted" {
+    #[link_name = "llvm.x86.movrsqi"]
+    fn movrsqi(src: *const i8) -> i8;
+    #[link_name = "llvm.x86.movrshi"]
+    fn movrshi(src: *const i16) -> i16;
+    #[link_name = "llvm.x86.movrssi"]
+    fn movrssi(src: *const i32) -> i32;
+    #[link_name = "llvm.x86.movrsdi"]
+    fn movrsdi(src: *const i64) -> i64;
+}
+
+/// Moves a byte from the source to the destination, with an indication that the source memory
+/// location is likely to become read-shared by multiple processors, i.e., read in the future by at
+/// least one other processor before it is written, assuming it is ever written in the future.
+#[inline]
+#[target_feature(enable = "movrs")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(movrs))]
+#[unstable(feature = "movrs_target_feature", issue = "137976")]
+pub unsafe fn _movrs_i8(src: *const i8) -> i8 {
+    movrsqi(src)
+}
+
+/// Moves a 16-bit word from the source to the destination, with an indication that the source memory
+/// location is likely to become read-shared by multiple processors, i.e., read in the future by at
+/// least one other processor before it is written, assuming it is ever written in the future.
+#[inline]
+#[target_feature(enable = "movrs")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(movrs))]
+#[unstable(feature = "movrs_target_feature", issue = "137976")]
+pub unsafe fn _movrs_i16(src: *const i16) -> i16 {
+    movrshi(src)
+}
+
+/// Moves a 32-bit doubleword from the source to the destination, with an indication that the source
+/// memory location is likely to become read-shared by multiple processors, i.e., read in the future
+/// by at least one other processor before it is written, assuming it is ever written in the future.
+#[inline]
+#[target_feature(enable = "movrs")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(movrs))]
+#[unstable(feature = "movrs_target_feature", issue = "137976")]
+pub unsafe fn _movrs_i32(src: *const i32) -> i32 {
+    movrssi(src)
+}
+
+/// Moves a 64-bit quadword from the source to the destination, with an indication that the source
+/// memory location is likely to become read-shared by multiple processors, i.e., read in the future
+/// by at least one other processor before it is written, assuming it is ever written in the future.
+#[inline]
+#[target_feature(enable = "movrs")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(movrs))]
+#[unstable(feature = "movrs_target_feature", issue = "137976")]
+pub unsafe fn _movrs_i64(src: *const i64) -> i64 {
+    movrsdi(src)
+}
+
+#[cfg(test)]
+mod tests {
+    use stdarch_test::simd_test;
+
+    use super::*;
+
+    #[simd_test(enable = "movrs")]
+    fn test_movrs_i8() {
+        let x: i8 = 42;
+        let y = unsafe { _movrs_i8(&x) };
+        assert_eq!(x, y);
+    }
+
+    #[simd_test(enable = "movrs")]
+    fn test_movrs_i16() {
+        let x: i16 = 42;
+        let y = unsafe { _movrs_i16(&x) };
+        assert_eq!(x, y);
+    }
+
+    #[simd_test(enable = "movrs")]
+    fn test_movrs_i32() {
+        let x: i32 = 42;
+        let y = unsafe { _movrs_i32(&x) };
+        assert_eq!(x, y);
+    }
+
+    #[simd_test(enable = "movrs")]
+    fn test_movrs_i64() {
+        let x: i64 = 42;
+        let y = unsafe { _movrs_i64(&x) };
+        assert_eq!(x, y);
+    }
+}
diff --git a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs
index 2ac05e28cb4ce..7fc47be42e14b 100644
--- a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs
+++ b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs
@@ -211,6 +211,7 @@ fn verify_all_signatures() {
                 "_rdseed64_step",
                 // Prefetch
                 "_mm_prefetch",
+                "_m_prefetchrs",
                 // CMPXCHG
                 "cmpxchg16b",
                 // Undefined
@@ -305,7 +306,7 @@ fn verify_all_signatures() {
             }
 
             // FIXME: these have not been added to Intrinsics Guide yet
-            if ["amx-avx512", "amx-fp8", "amx-movrs", "amx-tf32"]
+            if ["amx-avx512", "amx-fp8", "amx-movrs", "amx-tf32", "movrs"]
                 .iter()
                 .any(|f| feature.contains(f))
             {

From 6abb95cc00bf58dc6f9ce66c4c27934385f370bd Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Fri, 6 Mar 2026 13:15:28 +0100
Subject: [PATCH 05/18] gate use of `wasm_target_feature` on wasm target arch

---
 library/stdarch/examples/hex.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/library/stdarch/examples/hex.rs b/library/stdarch/examples/hex.rs
index 621f55bc0951f..21827b375adaf 100644
--- a/library/stdarch/examples/hex.rs
+++ b/library/stdarch/examples/hex.rs
@@ -13,7 +13,6 @@
 //! and you should see `746573740a` get printed out.
 
 #![allow(internal_features)]
-#![feature(wasm_target_feature)]
 #![cfg_attr(test, feature(test))]
 #![cfg_attr(
     any(target_arch = "x86", target_arch = "x86_64"),

From 6c7d342e10d83a33da19da621fb2b5b34a875729 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Fri, 6 Mar 2026 21:26:59 +0100
Subject: [PATCH 06/18] add ACLE random number generation intrinsics

---
 .../crates/core_arch/src/aarch64/mod.rs       |  4 ++
 .../crates/core_arch/src/aarch64/rand.rs      | 69 +++++++++++++++++++
 .../crates/stdarch-verify/tests/arm.rs        |  1 +
 3 files changed, 74 insertions(+)
 create mode 100644 library/stdarch/crates/core_arch/src/aarch64/rand.rs

diff --git a/library/stdarch/crates/core_arch/src/aarch64/mod.rs b/library/stdarch/crates/core_arch/src/aarch64/mod.rs
index b48bdac57e7db..d7295659c3c9a 100644
--- a/library/stdarch/crates/core_arch/src/aarch64/mod.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/mod.rs
@@ -17,6 +17,10 @@ mod mte;
 #[unstable(feature = "stdarch_aarch64_mte", issue = "129010")]
 pub use self::mte::*;
 
+mod rand;
+#[unstable(feature = "stdarch_aarch64_rand", issue = "153514")]
+pub use self::rand::*;
+
 mod neon;
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub use self::neon::*;
diff --git a/library/stdarch/crates/core_arch/src/aarch64/rand.rs b/library/stdarch/crates/core_arch/src/aarch64/rand.rs
new file mode 100644
index 0000000000000..5492fd014401a
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/aarch64/rand.rs
@@ -0,0 +1,69 @@
+//! AArch64 Random Number intrinsics
+//!
+//! [ACLE documentation](https://arm-software.github.io/acle/main/acle.html#random-number-generation-intrinsics)
+
+unsafe extern "unadjusted" {
+    #[cfg_attr(
+        any(target_arch = "aarch64", target_arch = "arm64ec"),
+        link_name = "llvm.aarch64.rndr"
+    )]
+    fn rndr_() -> Tuple;
+
+    #[cfg_attr(
+        any(target_arch = "aarch64", target_arch = "arm64ec"),
+        link_name = "llvm.aarch64.rndrrs"
+    )]
+    fn rndrrs_() -> Tuple;
+}
+
+#[repr(C)]
+struct Tuple {
+    bits: u64,
+    status: bool,
+}
+
+/// Stores a 64-bit random number into the object pointed to by the argument and returns
+/// zero. If the implementation could not generate a random number within a reasonable
+/// period of time the object pointed to by the input is set to zero and a non-zero value
+/// is returned.
+#[inline]
+#[target_feature(enable = "rand")]
+#[unstable(feature = "stdarch_aarch64_rand", issue = "153514")]
+pub unsafe fn __rndr(value: *mut u64) -> i32 {
+    let Tuple { bits, status } = rndr_();
+    unsafe { *value = bits };
+    status as i32
+}
+
+/// Reseeds the random number generator. After that stores a 64-bit random number into
+/// the object pointed to by the argument and returns zero. If the implementation could
+/// not generate a random number within a reasonable period of time the object pointed
+/// to by the input is set to zero and a non-zero value is returned.
+#[inline]
+#[target_feature(enable = "rand")]
+#[unstable(feature = "stdarch_aarch64_rand", issue = "153514")]
+pub unsafe fn __rndrrs(value: *mut u64) -> i32 {
+    let Tuple { bits, status } = rndrrs_();
+    unsafe { *value = bits };
+    status as i32
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use stdarch_test::assert_instr;
+
+    #[cfg_attr(test, assert_instr(mrs))]
+    #[allow(dead_code)]
+    #[target_feature(enable = "rand")]
+    unsafe fn test_rndr(value: &mut u64) -> i32 {
+        __rndr(value)
+    }
+
+    #[cfg_attr(test, assert_instr(mrs))]
+    #[allow(dead_code)]
+    #[target_feature(enable = "rand")]
+    unsafe fn test_rndrrs(value: &mut u64) -> i32 {
+        __rndrrs(value)
+    }
+}
diff --git a/library/stdarch/crates/stdarch-verify/tests/arm.rs b/library/stdarch/crates/stdarch-verify/tests/arm.rs
index 86897908e062c..3ef9ce2a38b69 100644
--- a/library/stdarch/crates/stdarch-verify/tests/arm.rs
+++ b/library/stdarch/crates/stdarch-verify/tests/arm.rs
@@ -445,6 +445,7 @@ fn verify_all_signatures() {
                     && !rust.file.ends_with("v7.rs\"")
                     && !rust.file.ends_with("v8.rs\"")
                     && !rust.file.ends_with("mte.rs\"")
+                    && !rust.file.ends_with("rand.rs\"")
                     && !rust.file.ends_with("ex.rs\"")
                     && !skip_intrinsic_verify.contains(&rust.name)
                 {

From 279c43e72724405c1a60db7d17e38c1096e3e04e Mon Sep 17 00:00:00 2001
From: The rustc-josh-sync Cronjob Bot <github-actions@github.com>
Date: Mon, 9 Mar 2026 04:42:45 +0000
Subject: [PATCH 07/18] Prepare for merging from rust-lang/rust

This updates the rust-version file to eda4fc7733ee89e484d7120cafbd80dcb2fce66e.
---
 library/stdarch/rust-version | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/library/stdarch/rust-version b/library/stdarch/rust-version
index b22c6c3869c62..db9492636f6ac 100644
--- a/library/stdarch/rust-version
+++ b/library/stdarch/rust-version
@@ -1 +1 @@
-139651428df86cf88443295542c12ea617cbb587
+eda4fc7733ee89e484d7120cafbd80dcb2fce66e

From f1bd64704781317a8c37f183ba212aa6fc1ea5a7 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Tue, 10 Mar 2026 12:19:33 +0100
Subject: [PATCH 08/18] Revert "Merge pull request #1871 from
 folkertdev/aarch64-float-min-max"

This reverts commit 6a8a764262df5e65c06bc5d9180046a636a53ce9, reversing
changes made to a37563b5f8cec0c873864b786c33d00386189916.
---
 .../core_arch/src/aarch64/neon/generated.rs   | 360 ++++++++++++++++--
 .../src/arm_shared/neon/generated.rs          |  80 +++-
 .../spec/neon/aarch64.spec.yml                |  85 ++++-
 .../spec/neon/arm_shared.spec.yml             |  32 +-
 4 files changed, 491 insertions(+), 66 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
index de64839661d6e..c4968a68c42f4 100644
--- a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
@@ -13532,7 +13532,14 @@ pub fn vmaxh_f16(a: f16, b: f16) -> f16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(fmaxnm))]
 pub fn vmaxnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
-    unsafe { simd_fmax(a, b) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fmaxnm.v1f64"
+        )]
+        fn _vmaxnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t;
+    }
+    unsafe { _vmaxnm_f64(a, b) }
 }
 #[doc = "Floating-point Maximum Number (vector)"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmq_f64)"]
@@ -13541,7 +13548,14 @@ pub fn vmaxnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(fmaxnm))]
 pub fn vmaxnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
-    unsafe { simd_fmax(a, b) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fmaxnm.v2f64"
+        )]
+        fn _vmaxnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+    }
+    unsafe { _vmaxnmq_f64(a, b) }
 }
 #[doc = "Floating-point Maximum Number"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmh_f16)"]
@@ -13551,7 +13565,14 @@ pub fn vmaxnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
 #[cfg(not(target_arch = "arm64ec"))]
 #[cfg_attr(test, assert_instr(fmaxnm))]
 pub fn vmaxnmh_f16(a: f16, b: f16) -> f16 {
-    f16::max(a, b)
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fmaxnm.f16"
+        )]
+        fn _vmaxnmh_f16(a: f16, b: f16) -> f16;
+    }
+    unsafe { _vmaxnmh_f16(a, b) }
 }
 #[doc = "Floating-point maximum number across vector"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmv_f16)"]
@@ -13561,7 +13582,14 @@ pub fn vmaxnmh_f16(a: f16, b: f16) -> f16 {
 #[cfg(not(target_arch = "arm64ec"))]
 #[cfg_attr(test, assert_instr(fmaxnmv))]
 pub fn vmaxnmv_f16(a: float16x4_t) -> f16 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fmaxnmv.f16.v4f16"
+        )]
+        fn _vmaxnmv_f16(a: float16x4_t) -> f16;
+    }
+    unsafe { _vmaxnmv_f16(a) }
 }
 #[doc = "Floating-point maximum number across vector"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmvq_f16)"]
@@ -13571,7 +13599,14 @@ pub fn vmaxnmv_f16(a: float16x4_t) -> f16 {
 #[cfg(not(target_arch = "arm64ec"))]
 #[cfg_attr(test, assert_instr(fmaxnmv))]
 pub fn vmaxnmvq_f16(a: float16x8_t) -> f16 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fmaxnmv.f16.v8f16"
+        )]
+        fn _vmaxnmvq_f16(a: float16x8_t) -> f16;
+    }
+    unsafe { _vmaxnmvq_f16(a) }
 }
 #[doc = "Floating-point maximum number across vector"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmv_f32)"]
@@ -13580,7 +13615,14 @@ pub fn vmaxnmvq_f16(a: float16x8_t) -> f16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(fmaxnmp))]
 pub fn vmaxnmv_f32(a: float32x2_t) -> f32 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fmaxnmv.f32.v2f32"
+        )]
+        fn _vmaxnmv_f32(a: float32x2_t) -> f32;
+    }
+    unsafe { _vmaxnmv_f32(a) }
 }
 #[doc = "Floating-point maximum number across vector"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmvq_f64)"]
@@ -13589,7 +13631,14 @@ pub fn vmaxnmv_f32(a: float32x2_t) -> f32 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(fmaxnmp))]
 pub fn vmaxnmvq_f64(a: float64x2_t) -> f64 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fmaxnmv.f64.v2f64"
+        )]
+        fn _vmaxnmvq_f64(a: float64x2_t) -> f64;
+    }
+    unsafe { _vmaxnmvq_f64(a) }
 }
 #[doc = "Floating-point maximum number across vector"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmvq_f32)"]
@@ -13598,7 +13647,14 @@ pub fn vmaxnmvq_f64(a: float64x2_t) -> f64 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(fmaxnmv))]
 pub fn vmaxnmvq_f32(a: float32x4_t) -> f32 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fmaxnmv.f32.v4f32"
+        )]
+        fn _vmaxnmvq_f32(a: float32x4_t) -> f32;
+    }
+    unsafe { _vmaxnmvq_f32(a) }
 }
 #[doc = "Floating-point maximum number across vector"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_f16)"]
@@ -13689,7 +13745,14 @@ pub fn vmaxvq_f64(a: float64x2_t) -> f64 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(smaxv))]
 pub fn vmaxv_s8(a: int8x8_t) -> i8 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.smaxv.i8.v8i8"
+        )]
+        fn _vmaxv_s8(a: int8x8_t) -> i8;
+    }
+    unsafe { _vmaxv_s8(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_s8)"]
@@ -13698,7 +13761,14 @@ pub fn vmaxv_s8(a: int8x8_t) -> i8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(smaxv))]
 pub fn vmaxvq_s8(a: int8x16_t) -> i8 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.smaxv.i8.v16i8"
+        )]
+        fn _vmaxvq_s8(a: int8x16_t) -> i8;
+    }
+    unsafe { _vmaxvq_s8(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_s16)"]
@@ -13707,7 +13777,14 @@ pub fn vmaxvq_s8(a: int8x16_t) -> i8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(smaxv))]
 pub fn vmaxv_s16(a: int16x4_t) -> i16 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.smaxv.i16.v4i16"
+        )]
+        fn _vmaxv_s16(a: int16x4_t) -> i16;
+    }
+    unsafe { _vmaxv_s16(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_s16)"]
@@ -13716,7 +13793,14 @@ pub fn vmaxv_s16(a: int16x4_t) -> i16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(smaxv))]
 pub fn vmaxvq_s16(a: int16x8_t) -> i16 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.smaxv.i16.v8i16"
+        )]
+        fn _vmaxvq_s16(a: int16x8_t) -> i16;
+    }
+    unsafe { _vmaxvq_s16(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_s32)"]
@@ -13725,7 +13809,14 @@ pub fn vmaxvq_s16(a: int16x8_t) -> i16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(smaxp))]
 pub fn vmaxv_s32(a: int32x2_t) -> i32 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.smaxv.i32.v2i32"
+        )]
+        fn _vmaxv_s32(a: int32x2_t) -> i32;
+    }
+    unsafe { _vmaxv_s32(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_s32)"]
@@ -13734,7 +13825,14 @@ pub fn vmaxv_s32(a: int32x2_t) -> i32 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(smaxv))]
 pub fn vmaxvq_s32(a: int32x4_t) -> i32 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.smaxv.i32.v4i32"
+        )]
+        fn _vmaxvq_s32(a: int32x4_t) -> i32;
+    }
+    unsafe { _vmaxvq_s32(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_u8)"]
@@ -13743,7 +13841,14 @@ pub fn vmaxvq_s32(a: int32x4_t) -> i32 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(umaxv))]
 pub fn vmaxv_u8(a: uint8x8_t) -> u8 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.umaxv.i8.v8i8"
+        )]
+        fn _vmaxv_u8(a: uint8x8_t) -> u8;
+    }
+    unsafe { _vmaxv_u8(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u8)"]
@@ -13752,7 +13857,14 @@ pub fn vmaxv_u8(a: uint8x8_t) -> u8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(umaxv))]
 pub fn vmaxvq_u8(a: uint8x16_t) -> u8 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.umaxv.i8.v16i8"
+        )]
+        fn _vmaxvq_u8(a: uint8x16_t) -> u8;
+    }
+    unsafe { _vmaxvq_u8(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_u16)"]
@@ -13761,7 +13873,14 @@ pub fn vmaxvq_u8(a: uint8x16_t) -> u8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(umaxv))]
 pub fn vmaxv_u16(a: uint16x4_t) -> u16 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.umaxv.i16.v4i16"
+        )]
+        fn _vmaxv_u16(a: uint16x4_t) -> u16;
+    }
+    unsafe { _vmaxv_u16(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u16)"]
@@ -13770,7 +13889,14 @@ pub fn vmaxv_u16(a: uint16x4_t) -> u16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(umaxv))]
 pub fn vmaxvq_u16(a: uint16x8_t) -> u16 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.umaxv.i16.v8i16"
+        )]
+        fn _vmaxvq_u16(a: uint16x8_t) -> u16;
+    }
+    unsafe { _vmaxvq_u16(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_u32)"]
@@ -13779,7 +13905,14 @@ pub fn vmaxvq_u16(a: uint16x8_t) -> u16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(umaxp))]
 pub fn vmaxv_u32(a: uint32x2_t) -> u32 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.umaxv.i32.v2i32"
+        )]
+        fn _vmaxv_u32(a: uint32x2_t) -> u32;
+    }
+    unsafe { _vmaxv_u32(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u32)"]
@@ -13788,7 +13921,14 @@ pub fn vmaxv_u32(a: uint32x2_t) -> u32 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(umaxv))]
 pub fn vmaxvq_u32(a: uint32x4_t) -> u32 {
-    unsafe { simd_reduce_max(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.umaxv.i32.v4i32"
+        )]
+        fn _vmaxvq_u32(a: uint32x4_t) -> u32;
+    }
+    unsafe { _vmaxvq_u32(a) }
 }
 #[doc = "Minimum (vector)"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmin_f64)"]
@@ -13846,7 +13986,14 @@ pub fn vminh_f16(a: f16, b: f16) -> f16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(fminnm))]
 pub fn vminnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
-    unsafe { simd_fmin(a, b) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fminnm.v1f64"
+        )]
+        fn _vminnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t;
+    }
+    unsafe { _vminnm_f64(a, b) }
 }
 #[doc = "Floating-point Minimum Number (vector)"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmq_f64)"]
@@ -13855,7 +14002,14 @@ pub fn vminnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(fminnm))]
 pub fn vminnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
-    unsafe { simd_fmin(a, b) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fminnm.v2f64"
+        )]
+        fn _vminnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+    }
+    unsafe { _vminnmq_f64(a, b) }
 }
 #[doc = "Floating-point Minimum Number"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmh_f16)"]
@@ -13865,7 +14019,14 @@ pub fn vminnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
 #[cfg(not(target_arch = "arm64ec"))]
 #[cfg_attr(test, assert_instr(fminnm))]
 pub fn vminnmh_f16(a: f16, b: f16) -> f16 {
-    f16::min(a, b)
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fminnm.f16"
+        )]
+        fn _vminnmh_f16(a: f16, b: f16) -> f16;
+    }
+    unsafe { _vminnmh_f16(a, b) }
 }
 #[doc = "Floating-point minimum number across vector"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmv_f16)"]
@@ -13875,7 +14036,14 @@ pub fn vminnmh_f16(a: f16, b: f16) -> f16 {
 #[cfg(not(target_arch = "arm64ec"))]
 #[cfg_attr(test, assert_instr(fminnmv))]
 pub fn vminnmv_f16(a: float16x4_t) -> f16 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fminnmv.f16.v4f16"
+        )]
+        fn _vminnmv_f16(a: float16x4_t) -> f16;
+    }
+    unsafe { _vminnmv_f16(a) }
 }
 #[doc = "Floating-point minimum number across vector"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmvq_f16)"]
@@ -13885,7 +14053,14 @@ pub fn vminnmv_f16(a: float16x4_t) -> f16 {
 #[cfg(not(target_arch = "arm64ec"))]
 #[cfg_attr(test, assert_instr(fminnmv))]
 pub fn vminnmvq_f16(a: float16x8_t) -> f16 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fminnmv.f16.v8f16"
+        )]
+        fn _vminnmvq_f16(a: float16x8_t) -> f16;
+    }
+    unsafe { _vminnmvq_f16(a) }
 }
 #[doc = "Floating-point minimum number across vector"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmv_f32)"]
@@ -13894,7 +14069,14 @@ pub fn vminnmvq_f16(a: float16x8_t) -> f16 {
 #[cfg_attr(test, assert_instr(fminnmp))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vminnmv_f32(a: float32x2_t) -> f32 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fminnmv.f32.v2f32"
+        )]
+        fn _vminnmv_f32(a: float32x2_t) -> f32;
+    }
+    unsafe { _vminnmv_f32(a) }
 }
 #[doc = "Floating-point minimum number across vector"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmvq_f64)"]
@@ -13903,7 +14085,14 @@ pub fn vminnmv_f32(a: float32x2_t) -> f32 {
 #[cfg_attr(test, assert_instr(fminnmp))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vminnmvq_f64(a: float64x2_t) -> f64 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fminnmv.f64.v2f64"
+        )]
+        fn _vminnmvq_f64(a: float64x2_t) -> f64;
+    }
+    unsafe { _vminnmvq_f64(a) }
 }
 #[doc = "Floating-point minimum number across vector"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmvq_f32)"]
@@ -13912,7 +14101,14 @@ pub fn vminnmvq_f64(a: float64x2_t) -> f64 {
 #[cfg_attr(test, assert_instr(fminnmv))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vminnmvq_f32(a: float32x4_t) -> f32 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fminnmv.f32.v4f32"
+        )]
+        fn _vminnmvq_f32(a: float32x4_t) -> f32;
+    }
+    unsafe { _vminnmvq_f32(a) }
 }
 #[doc = "Floating-point minimum number across vector"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_f16)"]
@@ -14003,7 +14199,14 @@ pub fn vminvq_f64(a: float64x2_t) -> f64 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(sminv))]
 pub fn vminv_s8(a: int8x8_t) -> i8 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.sminv.i8.v8i8"
+        )]
+        fn _vminv_s8(a: int8x8_t) -> i8;
+    }
+    unsafe { _vminv_s8(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_s8)"]
@@ -14012,7 +14215,14 @@ pub fn vminv_s8(a: int8x8_t) -> i8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(sminv))]
 pub fn vminvq_s8(a: int8x16_t) -> i8 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.sminv.i8.v16i8"
+        )]
+        fn _vminvq_s8(a: int8x16_t) -> i8;
+    }
+    unsafe { _vminvq_s8(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_s16)"]
@@ -14021,7 +14231,14 @@ pub fn vminvq_s8(a: int8x16_t) -> i8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(sminv))]
 pub fn vminv_s16(a: int16x4_t) -> i16 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.sminv.i16.v4i16"
+        )]
+        fn _vminv_s16(a: int16x4_t) -> i16;
+    }
+    unsafe { _vminv_s16(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_s16)"]
@@ -14030,7 +14247,14 @@ pub fn vminv_s16(a: int16x4_t) -> i16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(sminv))]
 pub fn vminvq_s16(a: int16x8_t) -> i16 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.sminv.i16.v8i16"
+        )]
+        fn _vminvq_s16(a: int16x8_t) -> i16;
+    }
+    unsafe { _vminvq_s16(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_s32)"]
@@ -14039,7 +14263,14 @@ pub fn vminvq_s16(a: int16x8_t) -> i16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(sminp))]
 pub fn vminv_s32(a: int32x2_t) -> i32 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.sminv.i32.v2i32"
+        )]
+        fn _vminv_s32(a: int32x2_t) -> i32;
+    }
+    unsafe { _vminv_s32(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_s32)"]
@@ -14048,7 +14279,14 @@ pub fn vminv_s32(a: int32x2_t) -> i32 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(sminv))]
 pub fn vminvq_s32(a: int32x4_t) -> i32 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.sminv.i32.v4i32"
+        )]
+        fn _vminvq_s32(a: int32x4_t) -> i32;
+    }
+    unsafe { _vminvq_s32(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_u8)"]
@@ -14057,7 +14295,14 @@ pub fn vminvq_s32(a: int32x4_t) -> i32 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(uminv))]
 pub fn vminv_u8(a: uint8x8_t) -> u8 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.uminv.i8.v8i8"
+        )]
+        fn _vminv_u8(a: uint8x8_t) -> u8;
+    }
+    unsafe { _vminv_u8(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_u8)"]
@@ -14066,7 +14311,14 @@ pub fn vminv_u8(a: uint8x8_t) -> u8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(uminv))]
 pub fn vminvq_u8(a: uint8x16_t) -> u8 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.uminv.i8.v16i8"
+        )]
+        fn _vminvq_u8(a: uint8x16_t) -> u8;
+    }
+    unsafe { _vminvq_u8(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_u16)"]
@@ -14075,7 +14327,14 @@ pub fn vminvq_u8(a: uint8x16_t) -> u8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(uminv))]
 pub fn vminv_u16(a: uint16x4_t) -> u16 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.uminv.i16.v4i16"
+        )]
+        fn _vminv_u16(a: uint16x4_t) -> u16;
+    }
+    unsafe { _vminv_u16(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_u16)"]
@@ -14084,7 +14343,14 @@ pub fn vminv_u16(a: uint16x4_t) -> u16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(uminv))]
 pub fn vminvq_u16(a: uint16x8_t) -> u16 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.uminv.i16.v8i16"
+        )]
+        fn _vminvq_u16(a: uint16x8_t) -> u16;
+    }
+    unsafe { _vminvq_u16(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_u32)"]
@@ -14093,7 +14359,14 @@ pub fn vminvq_u16(a: uint16x8_t) -> u16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(uminp))]
 pub fn vminv_u32(a: uint32x2_t) -> u32 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.uminv.i32.v2i32"
+        )]
+        fn _vminv_u32(a: uint32x2_t) -> u32;
+    }
+    unsafe { _vminv_u32(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_u32)"]
@@ -14102,7 +14375,14 @@ pub fn vminv_u32(a: uint32x2_t) -> u32 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(uminv))]
 pub fn vminvq_u32(a: uint32x4_t) -> u32 {
-    unsafe { simd_reduce_min(a) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.uminv.i32.v4i32"
+        )]
+        fn _vminvq_u32(a: uint32x4_t) -> u32;
+    }
+    unsafe { _vminvq_u32(a) }
 }
 #[doc = "Floating-point multiply-add to accumulator"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_f64)"]
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
index a7c33917a88cc..4a846e2877462 100644
--- a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -25891,7 +25891,15 @@ pub fn vmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
 )]
 #[cfg(not(target_arch = "arm64ec"))]
 pub fn vmaxnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t {
-    unsafe { simd_fmax(a, b) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v4f16")]
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fmaxnm.v4f16"
+        )]
+        fn _vmaxnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t;
+    }
+    unsafe { _vmaxnm_f16(a, b) }
 }
 #[doc = "Floating-point Maximum Number (vector)"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmq_f16)"]
@@ -25913,7 +25921,15 @@ pub fn vmaxnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t {
 )]
 #[cfg(not(target_arch = "arm64ec"))]
 pub fn vmaxnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t {
-    unsafe { simd_fmax(a, b) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v8f16")]
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fmaxnm.v8f16"
+        )]
+        fn _vmaxnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t;
+    }
+    unsafe { _vmaxnmq_f16(a, b) }
 }
 #[doc = "Floating-point Maximum Number (vector)"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnm_f32)"]
@@ -25934,7 +25950,15 @@ pub fn vmaxnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t {
     unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    unsafe { simd_fmax(a, b) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v2f32")]
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fmaxnm.v2f32"
+        )]
+        fn _vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+    }
+    unsafe { _vmaxnm_f32(a, b) }
 }
 #[doc = "Floating-point Maximum Number (vector)"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxnmq_f32)"]
@@ -25955,7 +25979,15 @@ pub fn vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
     unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    unsafe { simd_fmax(a, b) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v4f32")]
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fmaxnm.v4f32"
+        )]
+        fn _vmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+    }
+    unsafe { _vmaxnmq_f32(a, b) }
 }
 #[doc = "Minimum (vector)"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmin_f16)"]
@@ -26383,7 +26415,15 @@ pub fn vminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
 )]
 #[cfg(not(target_arch = "arm64ec"))]
 pub fn vminnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t {
-    unsafe { simd_fmin(a, b) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v4f16")]
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fminnm.v4f16"
+        )]
+        fn _vminnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t;
+    }
+    unsafe { _vminnm_f16(a, b) }
 }
 #[doc = "Floating-point Minimum Number (vector)"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmq_f16)"]
@@ -26405,7 +26445,15 @@ pub fn vminnm_f16(a: float16x4_t, b: float16x4_t) -> float16x4_t {
 )]
 #[cfg(not(target_arch = "arm64ec"))]
 pub fn vminnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t {
-    unsafe { simd_fmin(a, b) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v8f16")]
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fminnm.v8f16"
+        )]
+        fn _vminnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t;
+    }
+    unsafe { _vminnmq_f16(a, b) }
 }
 #[doc = "Floating-point Minimum Number (vector)"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnm_f32)"]
@@ -26426,7 +26474,15 @@ pub fn vminnmq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t {
     unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    unsafe { simd_fmin(a, b) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v2f32")]
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fminnm.v2f32"
+        )]
+        fn _vminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+    }
+    unsafe { _vminnm_f32(a, b) }
 }
 #[doc = "Floating-point Minimum Number (vector)"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminnmq_f32)"]
@@ -26447,7 +26503,15 @@ pub fn vminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
     unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    unsafe { simd_fmin(a, b) }
+    unsafe extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v4f32")]
+        #[cfg_attr(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            link_name = "llvm.aarch64.neon.fminnm.v4f32"
+        )]
+        fn _vminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+    }
+    unsafe { _vminnmq_f32(a, b) }
 }
 #[doc = "Floating-point multiply-add to accumulator"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_f32)"]
diff --git a/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
index 8574aacee6671..2b4282e8035b1 100644
--- a/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
+++ b/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
@@ -6625,6 +6625,7 @@ intrinsics:
               arch: aarch64,arm64ec
 
 
+
   - name: "vmaxnm{neon_type.no}"
     doc: Floating-point Maximum Number (vector)
     arguments: ["a: {neon_type}", "b: {neon_type}"]
@@ -6636,7 +6637,11 @@ intrinsics:
       - float64x1_t
       - float64x2_t
     compose:
-      - FnCall: [simd_fmax, [a, b]]
+      - LLVMLink:
+          name: "fmaxnm.{neon_type}"
+          links:
+            - link: "llvm.aarch64.neon.fmaxnm.{neon_type}"
+              arch: aarch64,arm64ec
 
 
   - name: "vmaxnmh_{type}"
@@ -6652,7 +6657,11 @@ intrinsics:
     types:
       - f16
     compose:
-      - FnCall: ["f16::max", [a, b]]
+      - LLVMLink:
+          name: "vmaxh.{neon_type}"
+          links:
+            - link: "llvm.aarch64.neon.fmaxnm.{type}"
+              arch: aarch64,arm64ec
 
 
   - name: "vminnmh_{type}"
@@ -6668,7 +6677,11 @@ intrinsics:
     types:
       - f16
     compose:
-      - FnCall: ["f16::min", [a, b]]
+      - LLVMLink:
+          name: "vminh.{neon_type}"
+          links:
+            - link: "llvm.aarch64.neon.fminnm.{type}"
+              arch: aarch64,arm64ec
 
 
   - name: "vmaxnmv{neon_type[0].no}"
@@ -6682,7 +6695,11 @@ intrinsics:
       - [float32x2_t, f32]
       - [float64x2_t, f64]
     compose:
-      - FnCall: [simd_reduce_max, [a]]
+      - LLVMLink:
+          name: "fmaxnmv.{neon_type[0]}"
+          links:
+            - link: "llvm.aarch64.neon.fmaxnmv.{type[1]}.{neon_type[0]}"
+              arch: aarch64,arm64ec
 
   - name: "vmaxnmv{neon_type[0].no}"
     doc: Floating-point maximum number across vector
@@ -6694,7 +6711,11 @@ intrinsics:
     types:
       - [float32x4_t, f32]
     compose:
-      - FnCall: [simd_reduce_max, [a]]
+      - LLVMLink:
+          name: "fmaxnmv.{neon_type[0]}"
+          links:
+            - link: "llvm.aarch64.neon.fmaxnmv.{type[1]}.{neon_type[0]}"
+              arch: aarch64,arm64ec
 
 
   - name: "vmaxnmv{neon_type[0].no}"
@@ -6711,7 +6732,11 @@ intrinsics:
       - [float16x4_t, f16]
       - [float16x8_t, f16]
     compose:
-      - FnCall: [simd_reduce_max, [a]]
+      - LLVMLink:
+          name: "fmaxnmv.{neon_type[0]}"
+          links:
+            - link: "llvm.aarch64.neon.fmaxnmv.{type[1]}.{neon_type[0]}"
+              arch: aarch64,arm64ec
 
 
   - name: "vminnmv{neon_type[0].no}"
@@ -6728,7 +6753,11 @@ intrinsics:
       - [float16x4_t, f16]
       - [float16x8_t, f16]
     compose:
-      - FnCall: [simd_reduce_min, [a]]
+      - LLVMLink:
+          name: "fminnmv.{neon_type[0]}"
+          links:
+            - link: "llvm.aarch64.neon.fminnmv.{type[1]}.{neon_type[0]}"
+              arch: aarch64,arm64ec
 
 
   - name: "vmaxv{neon_type[0].no}"
@@ -6837,7 +6866,11 @@ intrinsics:
       - float64x1_t
       - float64x2_t
     compose:
-      - FnCall: [simd_fmin, [a, b]]
+      - LLVMLink:
+          name: "fminnm.{neon_type}"
+          links:
+            - link: "llvm.aarch64.neon.fminnm.{neon_type}"
+              arch: aarch64,arm64ec
 
   - name: "vminnmv{neon_type[0].no}"
     doc: "Floating-point minimum number across vector"
@@ -6851,7 +6884,11 @@ intrinsics:
       - [float32x2_t, "f32"]
       - [float64x2_t, "f64"]
     compose:
-      - FnCall: [simd_reduce_min, [a]]
+      - LLVMLink:
+          name: "vminnmv.{neon_type[0]}"
+          links:
+            - link: "llvm.aarch64.neon.fminnmv.{type[1]}.{neon_type[0]}"
+              arch: aarch64,arm64ec
 
   - name: "vminnmv{neon_type[0].no}"
     doc: "Floating-point minimum number across vector"
@@ -6864,7 +6901,11 @@ intrinsics:
     types:
       - [float32x4_t, "f32"]
     compose:
-      - FnCall: [simd_reduce_min, [a]]
+      - LLVMLink:
+          name: "vminnmv.{neon_type[0]}"
+          links:
+            - link: "llvm.aarch64.neon.fminnmv.{type[1]}.{neon_type[0]}"
+              arch: aarch64,arm64ec
 
   - name: "vmovl_high{neon_type[0].noq}"
     doc: Vector move
@@ -13372,7 +13413,11 @@ intrinsics:
       - [int16x8_t, i16, 'smaxv']
       - [int32x4_t, i32, 'smaxv']
     compose:
-      - FnCall: [simd_reduce_max, [a]]
+      - LLVMLink:
+          name: "vmaxv{neon_type[0].no}"
+          links:
+            - link: "llvm.aarch64.neon.smaxv.{type[1]}.{neon_type[0]}"
+              arch: aarch64,arm64ec
 
   - name: "vmaxv{neon_type[0].no}"
     doc: "Horizontal vector max."
@@ -13390,7 +13435,11 @@ intrinsics:
       - [uint16x8_t, u16, 'umaxv']
       - [uint32x4_t, u32, 'umaxv']
     compose:
-      - FnCall: [simd_reduce_max, [a]]
+      - LLVMLink:
+          name: "vmaxv{neon_type[0].no}"
+          links:
+            - link: "llvm.aarch64.neon.umaxv.{type[1]}.{neon_type[0]}"
+              arch: aarch64,arm64ec
 
   - name: "vmaxv{neon_type[0].no}"
     doc: "Horizontal vector max."
@@ -13427,7 +13476,11 @@ intrinsics:
       - [int16x8_t, i16, 'sminv']
       - [int32x4_t, i32, 'sminv']
     compose:
-      - FnCall: [simd_reduce_min, [a]]
+      - LLVMLink:
+          name: "vminv{neon_type[0].no}"
+          links:
+            - link: "llvm.aarch64.neon.sminv.{type[1]}.{neon_type[0]}"
+              arch: aarch64,arm64ec
 
   - name: "vminv{neon_type[0].no}"
     doc: "Horizontal vector min."
@@ -13445,7 +13498,11 @@ intrinsics:
       - [uint16x8_t, u16, 'uminv']
       - [uint32x4_t, u32, 'uminv']
     compose:
-      - FnCall: [simd_reduce_min, [a]]
+      - LLVMLink:
+          name: "vminv{neon_type[0].no}"
+          links:
+            - link: "llvm.aarch64.neon.uminv.{type[1]}.{neon_type[0]}"
+              arch: aarch64,arm64ec
 
   - name: "vminv{neon_type[0].no}"
     doc: "Horizontal vector min."
diff --git a/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
index 5104ae607ccff..56b2252c9ef0c 100644
--- a/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
+++ b/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
@@ -7324,7 +7324,13 @@ intrinsics:
       - float32x2_t
       - float32x4_t
     compose:
-      - FnCall: [simd_fmax, [a, b]]
+      - LLVMLink:
+          name: "fmaxnm.{neon_type}"
+          links:
+            - link: "llvm.arm.neon.vmaxnm.{neon_type}"
+              arch: arm
+            - link: "llvm.aarch64.neon.fmaxnm.{neon_type}"
+              arch: aarch64,arm64ec
 
 
   - name: "vmaxnm{neon_type.no}"
@@ -7344,7 +7350,13 @@ intrinsics:
       - float16x4_t
       - float16x8_t
     compose:
-      - FnCall: [simd_fmax, [a, b]]
+      - LLVMLink:
+          name: "fmaxnm.{neon_type}"
+          links:
+            - link: "llvm.arm.neon.vmaxnm.{neon_type}"
+              arch: arm
+            - link: "llvm.aarch64.neon.fmaxnm.{neon_type}"
+              arch: aarch64,arm64ec
 
 
   - name: "vminnm{neon_type.no}"
@@ -7364,7 +7376,13 @@ intrinsics:
       - float16x4_t
       - float16x8_t
     compose:
-      - FnCall: [simd_fmin, [a, b]]
+      - LLVMLink:
+          name: "fminnm.{neon_type}"
+          links:
+            - link: "llvm.arm.neon.vminnm.{neon_type}"
+              arch: arm
+            - link: "llvm.aarch64.neon.fminnm.{neon_type}"
+              arch: aarch64,arm64ec
 
 
   - name: "vmin{neon_type.no}"
@@ -7477,7 +7495,13 @@ intrinsics:
       - float32x2_t
       - float32x4_t
     compose:
-      - FnCall: [simd_fmin, [a, b]]
+      - LLVMLink:
+          name: "fminnm.{neon_type}"
+          links:
+            - link: "llvm.arm.neon.vminnm.{neon_type}"
+              arch: arm
+            - link: "llvm.aarch64.neon.fminnm.{neon_type}"
+              arch: aarch64,arm64ec
 
   - name: "vpadd{neon_type.no}"
     doc: Floating-point add pairwise

From 46bada398f1067c831f8ce0c45cef9348a216b17 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Tue, 10 Mar 2026 13:03:00 +0100
Subject: [PATCH 09/18] aarch64: use `simd_reduce_{min, max}` on integers

---
 .../core_arch/src/aarch64/neon/generated.rs   | 216 ++----------------
 .../spec/neon/aarch64.spec.yml                |  24 +-
 2 files changed, 28 insertions(+), 212 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
index c4968a68c42f4..479a909a40a6e 100644
--- a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
@@ -13745,14 +13745,7 @@ pub fn vmaxvq_f64(a: float64x2_t) -> f64 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(smaxv))]
 pub fn vmaxv_s8(a: int8x8_t) -> i8 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.smaxv.i8.v8i8"
-        )]
-        fn _vmaxv_s8(a: int8x8_t) -> i8;
-    }
-    unsafe { _vmaxv_s8(a) }
+    unsafe { simd_reduce_max(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_s8)"]
@@ -13761,14 +13754,7 @@ pub fn vmaxv_s8(a: int8x8_t) -> i8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(smaxv))]
 pub fn vmaxvq_s8(a: int8x16_t) -> i8 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.smaxv.i8.v16i8"
-        )]
-        fn _vmaxvq_s8(a: int8x16_t) -> i8;
-    }
-    unsafe { _vmaxvq_s8(a) }
+    unsafe { simd_reduce_max(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_s16)"]
@@ -13777,14 +13763,7 @@ pub fn vmaxvq_s8(a: int8x16_t) -> i8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(smaxv))]
 pub fn vmaxv_s16(a: int16x4_t) -> i16 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.smaxv.i16.v4i16"
-        )]
-        fn _vmaxv_s16(a: int16x4_t) -> i16;
-    }
-    unsafe { _vmaxv_s16(a) }
+    unsafe { simd_reduce_max(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_s16)"]
@@ -13793,14 +13772,7 @@ pub fn vmaxv_s16(a: int16x4_t) -> i16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(smaxv))]
 pub fn vmaxvq_s16(a: int16x8_t) -> i16 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.smaxv.i16.v8i16"
-        )]
-        fn _vmaxvq_s16(a: int16x8_t) -> i16;
-    }
-    unsafe { _vmaxvq_s16(a) }
+    unsafe { simd_reduce_max(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_s32)"]
@@ -13809,14 +13781,7 @@ pub fn vmaxvq_s16(a: int16x8_t) -> i16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(smaxp))]
 pub fn vmaxv_s32(a: int32x2_t) -> i32 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.smaxv.i32.v2i32"
-        )]
-        fn _vmaxv_s32(a: int32x2_t) -> i32;
-    }
-    unsafe { _vmaxv_s32(a) }
+    unsafe { simd_reduce_max(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_s32)"]
@@ -13825,14 +13790,7 @@ pub fn vmaxv_s32(a: int32x2_t) -> i32 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(smaxv))]
 pub fn vmaxvq_s32(a: int32x4_t) -> i32 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.smaxv.i32.v4i32"
-        )]
-        fn _vmaxvq_s32(a: int32x4_t) -> i32;
-    }
-    unsafe { _vmaxvq_s32(a) }
+    unsafe { simd_reduce_max(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_u8)"]
@@ -13841,14 +13799,7 @@ pub fn vmaxvq_s32(a: int32x4_t) -> i32 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(umaxv))]
 pub fn vmaxv_u8(a: uint8x8_t) -> u8 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.umaxv.i8.v8i8"
-        )]
-        fn _vmaxv_u8(a: uint8x8_t) -> u8;
-    }
-    unsafe { _vmaxv_u8(a) }
+    unsafe { simd_reduce_max(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u8)"]
@@ -13857,14 +13808,7 @@ pub fn vmaxv_u8(a: uint8x8_t) -> u8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(umaxv))]
 pub fn vmaxvq_u8(a: uint8x16_t) -> u8 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.umaxv.i8.v16i8"
-        )]
-        fn _vmaxvq_u8(a: uint8x16_t) -> u8;
-    }
-    unsafe { _vmaxvq_u8(a) }
+    unsafe { simd_reduce_max(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_u16)"]
@@ -13873,14 +13817,7 @@ pub fn vmaxvq_u8(a: uint8x16_t) -> u8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(umaxv))]
 pub fn vmaxv_u16(a: uint16x4_t) -> u16 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.umaxv.i16.v4i16"
-        )]
-        fn _vmaxv_u16(a: uint16x4_t) -> u16;
-    }
-    unsafe { _vmaxv_u16(a) }
+    unsafe { simd_reduce_max(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u16)"]
@@ -13889,14 +13826,7 @@ pub fn vmaxv_u16(a: uint16x4_t) -> u16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(umaxv))]
 pub fn vmaxvq_u16(a: uint16x8_t) -> u16 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.umaxv.i16.v8i16"
-        )]
-        fn _vmaxvq_u16(a: uint16x8_t) -> u16;
-    }
-    unsafe { _vmaxvq_u16(a) }
+    unsafe { simd_reduce_max(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxv_u32)"]
@@ -13905,14 +13835,7 @@ pub fn vmaxvq_u16(a: uint16x8_t) -> u16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(umaxp))]
 pub fn vmaxv_u32(a: uint32x2_t) -> u32 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.umaxv.i32.v2i32"
-        )]
-        fn _vmaxv_u32(a: uint32x2_t) -> u32;
-    }
-    unsafe { _vmaxv_u32(a) }
+    unsafe { simd_reduce_max(a) }
 }
 #[doc = "Horizontal vector max."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u32)"]
@@ -13921,14 +13844,7 @@ pub fn vmaxv_u32(a: uint32x2_t) -> u32 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(umaxv))]
 pub fn vmaxvq_u32(a: uint32x4_t) -> u32 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.umaxv.i32.v4i32"
-        )]
-        fn _vmaxvq_u32(a: uint32x4_t) -> u32;
-    }
-    unsafe { _vmaxvq_u32(a) }
+    unsafe { simd_reduce_max(a) }
 }
 #[doc = "Minimum (vector)"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmin_f64)"]
@@ -14199,14 +14115,7 @@ pub fn vminvq_f64(a: float64x2_t) -> f64 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(sminv))]
 pub fn vminv_s8(a: int8x8_t) -> i8 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.sminv.i8.v8i8"
-        )]
-        fn _vminv_s8(a: int8x8_t) -> i8;
-    }
-    unsafe { _vminv_s8(a) }
+    unsafe { simd_reduce_min(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_s8)"]
@@ -14215,14 +14124,7 @@ pub fn vminv_s8(a: int8x8_t) -> i8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(sminv))]
 pub fn vminvq_s8(a: int8x16_t) -> i8 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.sminv.i8.v16i8"
-        )]
-        fn _vminvq_s8(a: int8x16_t) -> i8;
-    }
-    unsafe { _vminvq_s8(a) }
+    unsafe { simd_reduce_min(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_s16)"]
@@ -14231,14 +14133,7 @@ pub fn vminvq_s8(a: int8x16_t) -> i8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(sminv))]
 pub fn vminv_s16(a: int16x4_t) -> i16 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.sminv.i16.v4i16"
-        )]
-        fn _vminv_s16(a: int16x4_t) -> i16;
-    }
-    unsafe { _vminv_s16(a) }
+    unsafe { simd_reduce_min(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_s16)"]
@@ -14247,14 +14142,7 @@ pub fn vminv_s16(a: int16x4_t) -> i16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(sminv))]
 pub fn vminvq_s16(a: int16x8_t) -> i16 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.sminv.i16.v8i16"
-        )]
-        fn _vminvq_s16(a: int16x8_t) -> i16;
-    }
-    unsafe { _vminvq_s16(a) }
+    unsafe { simd_reduce_min(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_s32)"]
@@ -14263,14 +14151,7 @@ pub fn vminvq_s16(a: int16x8_t) -> i16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(sminp))]
 pub fn vminv_s32(a: int32x2_t) -> i32 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.sminv.i32.v2i32"
-        )]
-        fn _vminv_s32(a: int32x2_t) -> i32;
-    }
-    unsafe { _vminv_s32(a) }
+    unsafe { simd_reduce_min(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_s32)"]
@@ -14279,14 +14160,7 @@ pub fn vminv_s32(a: int32x2_t) -> i32 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(sminv))]
 pub fn vminvq_s32(a: int32x4_t) -> i32 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.sminv.i32.v4i32"
-        )]
-        fn _vminvq_s32(a: int32x4_t) -> i32;
-    }
-    unsafe { _vminvq_s32(a) }
+    unsafe { simd_reduce_min(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_u8)"]
@@ -14295,14 +14169,7 @@ pub fn vminvq_s32(a: int32x4_t) -> i32 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(uminv))]
 pub fn vminv_u8(a: uint8x8_t) -> u8 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.uminv.i8.v8i8"
-        )]
-        fn _vminv_u8(a: uint8x8_t) -> u8;
-    }
-    unsafe { _vminv_u8(a) }
+    unsafe { simd_reduce_min(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_u8)"]
@@ -14311,14 +14178,7 @@ pub fn vminv_u8(a: uint8x8_t) -> u8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(uminv))]
 pub fn vminvq_u8(a: uint8x16_t) -> u8 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.uminv.i8.v16i8"
-        )]
-        fn _vminvq_u8(a: uint8x16_t) -> u8;
-    }
-    unsafe { _vminvq_u8(a) }
+    unsafe { simd_reduce_min(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_u16)"]
@@ -14327,14 +14187,7 @@ pub fn vminvq_u8(a: uint8x16_t) -> u8 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(uminv))]
 pub fn vminv_u16(a: uint16x4_t) -> u16 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.uminv.i16.v4i16"
-        )]
-        fn _vminv_u16(a: uint16x4_t) -> u16;
-    }
-    unsafe { _vminv_u16(a) }
+    unsafe { simd_reduce_min(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_u16)"]
@@ -14343,14 +14196,7 @@ pub fn vminv_u16(a: uint16x4_t) -> u16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(uminv))]
 pub fn vminvq_u16(a: uint16x8_t) -> u16 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.uminv.i16.v8i16"
-        )]
-        fn _vminvq_u16(a: uint16x8_t) -> u16;
-    }
-    unsafe { _vminvq_u16(a) }
+    unsafe { simd_reduce_min(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminv_u32)"]
@@ -14359,14 +14205,7 @@ pub fn vminvq_u16(a: uint16x8_t) -> u16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(uminp))]
 pub fn vminv_u32(a: uint32x2_t) -> u32 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.uminv.i32.v2i32"
-        )]
-        fn _vminv_u32(a: uint32x2_t) -> u32;
-    }
-    unsafe { _vminv_u32(a) }
+    unsafe { simd_reduce_min(a) }
 }
 #[doc = "Horizontal vector min."]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vminvq_u32)"]
@@ -14375,14 +14214,7 @@ pub fn vminv_u32(a: uint32x2_t) -> u32 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(uminv))]
 pub fn vminvq_u32(a: uint32x4_t) -> u32 {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.uminv.i32.v4i32"
-        )]
-        fn _vminvq_u32(a: uint32x4_t) -> u32;
-    }
-    unsafe { _vminvq_u32(a) }
+    unsafe { simd_reduce_min(a) }
 }
 #[doc = "Floating-point multiply-add to accumulator"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmla_f64)"]
diff --git a/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
index 2b4282e8035b1..f6f3e029f22e4 100644
--- a/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
+++ b/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
@@ -13413,11 +13413,7 @@ intrinsics:
       - [int16x8_t, i16, 'smaxv']
       - [int32x4_t, i32, 'smaxv']
     compose:
-      - LLVMLink:
-          name: "vmaxv{neon_type[0].no}"
-          links:
-            - link: "llvm.aarch64.neon.smaxv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_max, [a]]
 
   - name: "vmaxv{neon_type[0].no}"
     doc: "Horizontal vector max."
@@ -13435,11 +13431,7 @@ intrinsics:
       - [uint16x8_t, u16, 'umaxv']
       - [uint32x4_t, u32, 'umaxv']
     compose:
-      - LLVMLink:
-          name: "vmaxv{neon_type[0].no}"
-          links:
-            - link: "llvm.aarch64.neon.umaxv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_max, [a]]
 
   - name: "vmaxv{neon_type[0].no}"
     doc: "Horizontal vector max."
@@ -13476,11 +13468,7 @@ intrinsics:
       - [int16x8_t, i16, 'sminv']
       - [int32x4_t, i32, 'sminv']
     compose:
-      - LLVMLink:
-          name: "vminv{neon_type[0].no}"
-          links:
-            - link: "llvm.aarch64.neon.sminv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_min, [a]]
 
   - name: "vminv{neon_type[0].no}"
     doc: "Horizontal vector min."
@@ -13498,11 +13486,7 @@ intrinsics:
       - [uint16x8_t, u16, 'uminv']
       - [uint32x4_t, u32, 'uminv']
     compose:
-      - LLVMLink:
-          name: "vminv{neon_type[0].no}"
-          links:
-            - link: "llvm.aarch64.neon.uminv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_min, [a]]
 
   - name: "vminv{neon_type[0].no}"
     doc: "Horizontal vector min."

From 752a87fb4bc4b3ac702612c445b1869178caad85 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Tue, 10 Mar 2026 17:42:37 +0100
Subject: [PATCH 10/18] s390x: use llvm.s390 intrinsics instead of
 simd_fmin/fmax

---
 .../crates/core_arch/src/s390x/vector.rs      | 37 +++++++++++++++++--
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/s390x/vector.rs b/library/stdarch/crates/core_arch/src/s390x/vector.rs
index 346cd674df665..2f31eb48f88e4 100644
--- a/library/stdarch/crates/core_arch/src/s390x/vector.rs
+++ b/library/stdarch/crates/core_arch/src/s390x/vector.rs
@@ -335,6 +335,11 @@ unsafe extern "unadjusted" {
     #[link_name = "llvm.s390.vcfn"] fn vcfn(a: vector_signed_short, immarg: i32) -> vector_signed_short;
     #[link_name = "llvm.s390.vcnf"] fn vcnf(a: vector_signed_short, immarg: i32) -> vector_signed_short;
     #[link_name = "llvm.s390.vcrnfs"] fn vcrnfs(a: vector_float, b: vector_float, immarg: i32) -> vector_signed_short;
+
+    #[link_name = "llvm.s390.vfmaxsb"] fn vfmaxsb(a: vector_float, b: vector_float, mode: i32) -> vector_float;
+    #[link_name = "llvm.s390.vfmaxdb"] fn vfmaxdb(a: vector_double, b: vector_double, mode: i32) -> vector_double;
+    #[link_name = "llvm.s390.vfminsb"] fn vfminsb(a: vector_float, b: vector_float, mode: i32) -> vector_float;
+    #[link_name = "llvm.s390.vfmindb"] fn vfmindb(a: vector_double, b: vector_double, mode: i32) -> vector_double;
 }
 
 #[repr(simd)]
@@ -780,8 +785,20 @@ mod sealed {
         impl_max!(vec_vmxslg, vector_unsigned_long_long, vmxlg);
     }
 
-    test_impl! { vec_vfmaxsb (a: vector_float, b: vector_float) -> vector_float [simd_fmax, "vector-enhancements-1" vfmaxsb ] }
-    test_impl! { vec_vfmaxdb (a: vector_double, b: vector_double) -> vector_double [simd_fmax, "vector-enhancements-1" vfmaxdb] }
+    #[inline]
+    #[target_feature(enable = "vector")]
+    unsafe fn vfmaxsb_m0(a: vector_float, b: vector_float) -> vector_float {
+        // clang uses mode 0 for `vec_max`, so we do the same.
+        vfmaxsb(a, b, const { 0 })
+    }
+    #[inline]
+    #[target_feature(enable = "vector")]
+    unsafe fn vfmaxdb_m0(a: vector_double, b: vector_double) -> vector_double {
+        vfmaxdb(a, b, const { 0 })
+    }
+
+    test_impl! { vec_vfmaxsb (a: vector_float, b: vector_float) -> vector_float [vfmaxsb_m0, "vector-enhancements-1" vfmaxsb ] }
+    test_impl! { vec_vfmaxdb (a: vector_double, b: vector_double) -> vector_double [vfmaxdb_m0, "vector-enhancements-1" vfmaxdb] }
 
     impl_vec_trait!([VectorMax vec_max] vec_vfmaxsb (vector_float, vector_float) -> vector_float);
     impl_vec_trait!([VectorMax vec_max] vec_vfmaxdb (vector_double, vector_double) -> vector_double);
@@ -827,8 +844,20 @@ mod sealed {
         impl_min!(vec_vmnslg, vector_unsigned_long_long, vmnlg);
     }
 
-    test_impl! { vec_vfminsb (a: vector_float, b: vector_float) -> vector_float [simd_fmin, "vector-enhancements-1" vfminsb]  }
-    test_impl! { vec_vfmindb (a: vector_double, b: vector_double) -> vector_double [simd_fmin, "vector-enhancements-1" vfmindb]  }
+    #[inline]
+    #[target_feature(enable = "vector")]
+    unsafe fn vfminsb_m0(a: vector_float, b: vector_float) -> vector_float {
+        // clang uses mode 0 for `vec_min`, so we do the same.
+        vfminsb(a, b, const { 0 })
+    }
+    #[inline]
+    #[target_feature(enable = "vector")]
+    unsafe fn vfmindb_m0(a: vector_double, b: vector_double) -> vector_double {
+        vfmindb(a, b, const { 0 })
+    }
+
+    test_impl! { vec_vfminsb (a: vector_float, b: vector_float) -> vector_float [vfminsb_m0, "vector-enhancements-1" vfminsb]  }
+    test_impl! { vec_vfmindb (a: vector_double, b: vector_double) -> vector_double [vfmindb_m0, "vector-enhancements-1" vfmindb]  }
 
     impl_vec_trait!([VectorMin vec_min] vec_vfminsb (vector_float, vector_float) -> vector_float);
     impl_vec_trait!([VectorMin vec_min] vec_vfmindb (vector_double, vector_double) -> vector_double);

From 47d82e245a2da73f9207ef476995a0c3b090e677 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Tue, 10 Mar 2026 23:26:51 +0100
Subject: [PATCH 11/18] add f32 min/max tests

---
 .../stdarch/crates/core_arch/src/s390x/vector.rs    | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/library/stdarch/crates/core_arch/src/s390x/vector.rs b/library/stdarch/crates/core_arch/src/s390x/vector.rs
index 2f31eb48f88e4..33921eca5f2aa 100644
--- a/library/stdarch/crates/core_arch/src/s390x/vector.rs
+++ b/library/stdarch/crates/core_arch/src/s390x/vector.rs
@@ -7506,6 +7506,19 @@ mod tests {
         [0, !0, !0, !0]
     }
 
+    // f32 is the tricky case for max/min as that needs a fallback on z13
+    test_vec_2! { test_vec_max, vec_max, f32x4, f32x4 -> f32x4,
+        [1.0,   f32::NAN, f32::INFINITY, 2.0],
+        [-10.0, -10.0,    5.0,           f32::NAN],
+        [1.0,   -10.0,    f32::INFINITY, 2.0]
+    }
+
+    test_vec_2! { test_vec_min, vec_min, f32x4, f32x4 -> f32x4,
+        [1.0,   f32::NAN, f32::INFINITY, 2.0],
+        [-10.0, -10.0,    5.0,           f32::NAN],
+        [-10.0, -10.0,    5.0,           2.0]
+    }
+
     #[simd_test(enable = "vector")]
     fn test_vec_meadd() {
         let a = vector_unsigned_short([1, 0, 2, 0, 3, 0, 4, 0]);

From a6687175c9c177457602efc47c0d52e0e6a69d52 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Wed, 11 Mar 2026 08:52:03 +0100
Subject: [PATCH 12/18] go back to portable LLVM intrinsic to avoid fallback
 trouble

---
 .../crates/core_arch/src/s390x/vector.rs      | 49 +++++++------------
 1 file changed, 17 insertions(+), 32 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/s390x/vector.rs b/library/stdarch/crates/core_arch/src/s390x/vector.rs
index 33921eca5f2aa..31b9dc5eac70b 100644
--- a/library/stdarch/crates/core_arch/src/s390x/vector.rs
+++ b/library/stdarch/crates/core_arch/src/s390x/vector.rs
@@ -336,10 +336,19 @@ unsafe extern "unadjusted" {
     #[link_name = "llvm.s390.vcnf"] fn vcnf(a: vector_signed_short, immarg: i32) -> vector_signed_short;
     #[link_name = "llvm.s390.vcrnfs"] fn vcrnfs(a: vector_float, b: vector_float, immarg: i32) -> vector_signed_short;
 
-    #[link_name = "llvm.s390.vfmaxsb"] fn vfmaxsb(a: vector_float, b: vector_float, mode: i32) -> vector_float;
-    #[link_name = "llvm.s390.vfmaxdb"] fn vfmaxdb(a: vector_double, b: vector_double, mode: i32) -> vector_double;
-    #[link_name = "llvm.s390.vfminsb"] fn vfminsb(a: vector_float, b: vector_float, mode: i32) -> vector_float;
-    #[link_name = "llvm.s390.vfmindb"] fn vfmindb(a: vector_double, b: vector_double, mode: i32) -> vector_double;
+    // These are the intrinsics we'd like to use (with mode 0). However, they require
+    // "vector-enhancements-1" and don't have a fallback, whereas `vec_min`/`vec_max` should be
+    // available with just "vector". Therefore, we cannot use them.
+    // #[link_name = "llvm.s390.vfmaxsb"] fn vfmaxsb(a: vector_float, b: vector_float, mode: i32) -> vector_float;
+    // #[link_name = "llvm.s390.vfmaxdb"] fn vfmaxdb(a: vector_double, b: vector_double, mode: i32) -> vector_double;
+    // #[link_name = "llvm.s390.vfminsb"] fn vfminsb(a: vector_float, b: vector_float, mode: i32) -> vector_float;
+    // #[link_name = "llvm.s390.vfmindb"] fn vfmindb(a: vector_double, b: vector_double, mode: i32) -> vector_double;
+    // Instead, we use "portable" LLVM intrinsics -- even though those have the wrong semantics
+    // (https://github.com/rust-lang/stdarch/issues/2060), they usually do the right thing.
+    #[link_name = "llvm.minnum.v4f32"] fn minnum_v4f32(a: vector_float, b: vector_float) -> vector_float;
+    #[link_name = "llvm.minnum.v2f64"] fn minnum_v2f64(a: vector_double, b: vector_double) -> vector_double;
+    #[link_name = "llvm.maxnum.v4f32"] fn maxnum_v4f32(a: vector_float, b: vector_float) -> vector_float;
+    #[link_name = "llvm.maxnum.v2f64"] fn maxnum_v2f64(a: vector_double, b: vector_double) -> vector_double;
 }
 
 #[repr(simd)]
@@ -785,20 +794,8 @@ mod sealed {
         impl_max!(vec_vmxslg, vector_unsigned_long_long, vmxlg);
     }
 
-    #[inline]
-    #[target_feature(enable = "vector")]
-    unsafe fn vfmaxsb_m0(a: vector_float, b: vector_float) -> vector_float {
-        // clang uses mode 0 for `vec_max`, so we do the same.
-        vfmaxsb(a, b, const { 0 })
-    }
-    #[inline]
-    #[target_feature(enable = "vector")]
-    unsafe fn vfmaxdb_m0(a: vector_double, b: vector_double) -> vector_double {
-        vfmaxdb(a, b, const { 0 })
-    }
-
-    test_impl! { vec_vfmaxsb (a: vector_float, b: vector_float) -> vector_float [vfmaxsb_m0, "vector-enhancements-1" vfmaxsb ] }
-    test_impl! { vec_vfmaxdb (a: vector_double, b: vector_double) -> vector_double [vfmaxdb_m0, "vector-enhancements-1" vfmaxdb] }
+    test_impl! { vec_vfmaxsb (a: vector_float, b: vector_float) -> vector_float [maxnum_v4f32, "vector-enhancements-1" vfmaxsb] }
+    test_impl! { vec_vfmaxdb (a: vector_double, b: vector_double) -> vector_double [maxnum_v2f64, "vector-enhancements-1" vfmaxdb] }
 
     impl_vec_trait!([VectorMax vec_max] vec_vfmaxsb (vector_float, vector_float) -> vector_float);
     impl_vec_trait!([VectorMax vec_max] vec_vfmaxdb (vector_double, vector_double) -> vector_double);
@@ -844,20 +841,8 @@ mod sealed {
         impl_min!(vec_vmnslg, vector_unsigned_long_long, vmnlg);
     }
 
-    #[inline]
-    #[target_feature(enable = "vector")]
-    unsafe fn vfminsb_m0(a: vector_float, b: vector_float) -> vector_float {
-        // clang uses mode 0 for `vec_min`, so we do the same.
-        vfminsb(a, b, const { 0 })
-    }
-    #[inline]
-    #[target_feature(enable = "vector")]
-    unsafe fn vfmindb_m0(a: vector_double, b: vector_double) -> vector_double {
-        vfmindb(a, b, const { 0 })
-    }
-
-    test_impl! { vec_vfminsb (a: vector_float, b: vector_float) -> vector_float [vfminsb_m0, "vector-enhancements-1" vfminsb]  }
-    test_impl! { vec_vfmindb (a: vector_double, b: vector_double) -> vector_double [vfmindb_m0, "vector-enhancements-1" vfmindb]  }
+    test_impl! { vec_vfminsb (a: vector_float, b: vector_float) -> vector_float [minnum_v4f32, "vector-enhancements-1" vfminsb] }
+    test_impl! { vec_vfmindb (a: vector_double, b: vector_double) -> vector_double [minnum_v2f64, "vector-enhancements-1" vfmindb] }
 
     impl_vec_trait!([VectorMin vec_min] vec_vfminsb (vector_float, vector_float) -> vector_float);
     impl_vec_trait!([VectorMin vec_min] vec_vfmindb (vector_double, vector_double) -> vector_double);

From 80b869876a07179e400e3aba4e53465fc5ecd4c5 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Wed, 11 Mar 2026 18:31:49 +0100
Subject: [PATCH 13/18] s390x: add f64 tests for vec_min

---
 .../crates/core_arch/src/s390x/vector.rs        | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/s390x/vector.rs b/library/stdarch/crates/core_arch/src/s390x/vector.rs
index 31b9dc5eac70b..376c912c04090 100644
--- a/library/stdarch/crates/core_arch/src/s390x/vector.rs
+++ b/library/stdarch/crates/core_arch/src/s390x/vector.rs
@@ -7491,19 +7491,30 @@ mod tests {
         [0, !0, !0, !0]
     }
 
-    // f32 is the tricky case for max/min as that needs a fallback on z13
-    test_vec_2! { test_vec_max, vec_max, f32x4, f32x4 -> f32x4,
+    test_vec_2! { test_vec_max_f32, vec_max, f32x4, f32x4 -> f32x4,
         [1.0,   f32::NAN, f32::INFINITY, 2.0],
         [-10.0, -10.0,    5.0,           f32::NAN],
         [1.0,   -10.0,    f32::INFINITY, 2.0]
     }
 
-    test_vec_2! { test_vec_min, vec_min, f32x4, f32x4 -> f32x4,
+    test_vec_2! { test_vec_min_f32, vec_min, f32x4, f32x4 -> f32x4,
         [1.0,   f32::NAN, f32::INFINITY, 2.0],
         [-10.0, -10.0,    5.0,           f32::NAN],
         [-10.0, -10.0,    5.0,           2.0]
     }
 
+    test_vec_2! { test_vec_max_f64, vec_max, f64x2, f64x2 -> f64x2,
+        [f64::NAN, 2.0],
+        [-10.0,    f64::NAN],
+        [-10.0,    2.0]
+    }
+
+    test_vec_2! { test_vec_min_f64, vec_min, f64x2, f64x2 -> f64x2,
+        [f64::NAN, 2.0],
+        [-10.0,    f64::NAN],
+        [-10.0,    2.0]
+    }
+
     #[simd_test(enable = "vector")]
     fn test_vec_meadd() {
         let a = vector_unsigned_short([1, 0, 2, 0, 3, 0, 4, 0]);

From 4f10c640d16e79e17924cfe6f2f60f45d43b1cac Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Thu, 12 Mar 2026 10:37:52 +0100
Subject: [PATCH 14/18] remove `cfg_attr` on `aarch64`/`arm64ec` in the aarch64
 module

---
 .../crates/core_arch/src/aarch64/mte.rs       | 30 ++++---------------
 .../crates/core_arch/src/aarch64/rand.rs      | 10 ++-----
 2 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/aarch64/mte.rs b/library/stdarch/crates/core_arch/src/aarch64/mte.rs
index c400f774bcce0..1b05eb3498efa 100644
--- a/library/stdarch/crates/core_arch/src/aarch64/mte.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/mte.rs
@@ -3,35 +3,17 @@
 //! [ACLE documentation](https://arm-software.github.io/acle/main/acle.html#markdown-toc-mte-intrinsics)
 
 unsafe extern "unadjusted" {
-    #[cfg_attr(
-        any(target_arch = "aarch64", target_arch = "arm64ec"),
-        link_name = "llvm.aarch64.irg"
-    )]
+    #[link_name = "llvm.aarch64.irg"]
     fn irg_(ptr: *const (), exclude: i64) -> *const ();
-    #[cfg_attr(
-        any(target_arch = "aarch64", target_arch = "arm64ec"),
-        link_name = "llvm.aarch64.gmi"
-    )]
+    #[link_name = "llvm.aarch64.gmi"]
     fn gmi_(ptr: *const (), exclude: i64) -> i64;
-    #[cfg_attr(
-        any(target_arch = "aarch64", target_arch = "arm64ec"),
-        link_name = "llvm.aarch64.ldg"
-    )]
+    #[link_name = "llvm.aarch64.ldg"]
     fn ldg_(ptr: *const (), tag_ptr: *const ()) -> *const ();
-    #[cfg_attr(
-        any(target_arch = "aarch64", target_arch = "arm64ec"),
-        link_name = "llvm.aarch64.stg"
-    )]
+    #[link_name = "llvm.aarch64.stg"]
     fn stg_(tagged_ptr: *const (), addr_to_tag: *const ());
-    #[cfg_attr(
-        any(target_arch = "aarch64", target_arch = "arm64ec"),
-        link_name = "llvm.aarch64.addg"
-    )]
+    #[link_name = "llvm.aarch64.addg"]
     fn addg_(ptr: *const (), value: i64) -> *const ();
-    #[cfg_attr(
-        any(target_arch = "aarch64", target_arch = "arm64ec"),
-        link_name = "llvm.aarch64.subp"
-    )]
+    #[link_name = "llvm.aarch64.subp"]
     fn subp_(ptr_a: *const (), ptr_b: *const ()) -> i64;
 }
 
diff --git a/library/stdarch/crates/core_arch/src/aarch64/rand.rs b/library/stdarch/crates/core_arch/src/aarch64/rand.rs
index 5492fd014401a..17b616f4ecf4c 100644
--- a/library/stdarch/crates/core_arch/src/aarch64/rand.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/rand.rs
@@ -3,16 +3,10 @@
 //! [ACLE documentation](https://arm-software.github.io/acle/main/acle.html#random-number-generation-intrinsics)
 
 unsafe extern "unadjusted" {
-    #[cfg_attr(
-        any(target_arch = "aarch64", target_arch = "arm64ec"),
-        link_name = "llvm.aarch64.rndr"
-    )]
+    #[link_name = "llvm.aarch64.rndr"]
     fn rndr_() -> Tuple;
 
-    #[cfg_attr(
-        any(target_arch = "aarch64", target_arch = "arm64ec"),
-        link_name = "llvm.aarch64.rndrrs"
-    )]
+    #[link_name = "llvm.aarch64.rndrrs"]
     fn rndrrs_() -> Tuple;
 }
 

From 30c24f45301ae5636dc84c4336d62fa7b8134224 Mon Sep 17 00:00:00 2001
From: anonymous <you@example.com>
Date: Thu, 12 Mar 2026 16:05:20 -0700
Subject: [PATCH 15/18] ci: update to actions/checkout@v6

ci is showing a lot of warnings (72) right now. apparently
actions/checkout@v4 uses Node.js 20, and all github actions are
scheduled to be force opted-in to Node.js 24 on 2026-06-02.
I don't anticipate bumping the checkout action to v6 / Node.js 24
to cause any issues (Node.js 24 drops support for ARM32 and macOS
versions <= 13.4, but this shouldn't matter because we use Docker
to test in those environments, not github runners natively) but if
it does cause issues it's probably better to find out now rather
than by surprise 3 months from now... :)
---
 library/stdarch/.github/workflows/main.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/library/stdarch/.github/workflows/main.yml b/library/stdarch/.github/workflows/main.yml
index 0ec355aa3ca4f..3749ed1f6ac81 100644
--- a/library/stdarch/.github/workflows/main.yml
+++ b/library/stdarch/.github/workflows/main.yml
@@ -8,7 +8,7 @@ jobs:
     name: Check Style
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
     - name: Install Rust
       run: rustup update nightly --no-self-update && rustup default nightly
     - run: ci/style.sh
@@ -18,7 +18,7 @@ jobs:
     needs: [style]
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
     - name: Install Rust
       run: rustup update nightly --no-self-update && rustup default nightly
     - run: ci/dox.sh
@@ -30,7 +30,7 @@ jobs:
     needs: [style]
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
     - name: Install Rust
       run: rustup update nightly --no-self-update && rustup default nightly
     - run: cargo test --manifest-path crates/stdarch-verify/Cargo.toml
@@ -216,7 +216,7 @@ jobs:
           build_std: true
 
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
     - name: Install Rust
       run: |
         rustup update nightly --no-self-update
@@ -285,7 +285,7 @@ jobs:
             build_std: true
 
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
     - name: Install Rust
       run: |
         rustup update nightly --no-self-update
@@ -310,7 +310,7 @@ jobs:
     name: Check stdarch-gen-{arm, loongarch, hexagon} output
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
     - name: Install Rust
       run: rustup update nightly && rustup default nightly && rustup component add rustfmt
     - name: Check arm spec

From ccf7d4e978b1843ba6c6f448934791e35b0e2a7f Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Thu, 12 Mar 2026 12:32:35 +0100
Subject: [PATCH 16/18] inline `assert_instr` tests

---
 .../crates/core_arch/src/aarch64/mte.rs       | 16 +++++++-----
 .../crates/core_arch/src/aarch64/rand.rs      | 25 ++++---------------
 2 files changed, 15 insertions(+), 26 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/aarch64/mte.rs b/library/stdarch/crates/core_arch/src/aarch64/mte.rs
index 1b05eb3498efa..a5031a45c1a0e 100644
--- a/library/stdarch/crates/core_arch/src/aarch64/mte.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/mte.rs
@@ -109,42 +109,46 @@ mod test {
     use super::*;
     use stdarch_test::assert_instr;
 
-    #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(irg))] // FIXME: MSVC  `dumpbin` doesn't support MTE
+    // Instruction tests are separate because the functions use generics.
+    //
+    // FIXME: As of 2026 MSVC  `dumpbin` doesn't support MTE.
+
+    #[cfg_attr(not(target_env = "msvc"), assert_instr(irg))]
     #[allow(dead_code)]
     #[target_feature(enable = "mte")]
     unsafe fn test_arm_mte_create_random_tag(src: *const (), mask: u64) -> *const () {
         __arm_mte_create_random_tag(src, mask)
     }
 
-    #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(addg))]
+    #[cfg_attr(not(target_env = "msvc"), assert_instr(addg))]
     #[allow(dead_code)]
     #[target_feature(enable = "mte")]
     unsafe fn test_arm_mte_increment_tag(src: *const ()) -> *const () {
         __arm_mte_increment_tag::<1, _>(src)
     }
 
-    #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(gmi))]
+    #[cfg_attr(not(target_env = "msvc"), assert_instr(gmi))]
     #[allow(dead_code)]
     #[target_feature(enable = "mte")]
     unsafe fn test_arm_mte_exclude_tag(src: *const (), excluded: u64) -> u64 {
         __arm_mte_exclude_tag(src, excluded)
     }
 
-    #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(stg))]
+    #[cfg_attr(not(target_env = "msvc"), assert_instr(stg))]
     #[allow(dead_code)]
     #[target_feature(enable = "mte")]
     unsafe fn test_arm_mte_set_tag(src: *const ()) {
         __arm_mte_set_tag(src)
     }
 
-    #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(ldg))]
+    #[cfg_attr(not(target_env = "msvc"), assert_instr(ldg))]
     #[allow(dead_code)]
     #[target_feature(enable = "mte")]
     unsafe fn test_arm_mte_get_tag(src: *const ()) -> *const () {
         __arm_mte_get_tag(src)
     }
 
-    #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(subp))]
+    #[cfg_attr(not(target_env = "msvc"), assert_instr(subp))]
     #[allow(dead_code)]
     #[target_feature(enable = "mte")]
     unsafe fn test_arm_mte_ptrdiff(a: *const (), b: *const ()) -> i64 {
diff --git a/library/stdarch/crates/core_arch/src/aarch64/rand.rs b/library/stdarch/crates/core_arch/src/aarch64/rand.rs
index 17b616f4ecf4c..3f52cf2ce8657 100644
--- a/library/stdarch/crates/core_arch/src/aarch64/rand.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/rand.rs
@@ -2,6 +2,9 @@
 //!
 //! [ACLE documentation](https://arm-software.github.io/acle/main/acle.html#random-number-generation-intrinsics)
 
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
 unsafe extern "unadjusted" {
     #[link_name = "llvm.aarch64.rndr"]
     fn rndr_() -> Tuple;
@@ -22,6 +25,7 @@ struct Tuple {
 /// is returned.
 #[inline]
 #[target_feature(enable = "rand")]
+#[cfg_attr(test, assert_instr(mrs))]
 #[unstable(feature = "stdarch_aarch64_rand", issue = "153514")]
 pub unsafe fn __rndr(value: *mut u64) -> i32 {
     let Tuple { bits, status } = rndr_();
@@ -35,29 +39,10 @@ pub unsafe fn __rndr(value: *mut u64) -> i32 {
 /// to by the input is set to zero and a non-zero value is returned.
 #[inline]
 #[target_feature(enable = "rand")]
+#[cfg_attr(test, assert_instr(mrs))]
 #[unstable(feature = "stdarch_aarch64_rand", issue = "153514")]
 pub unsafe fn __rndrrs(value: *mut u64) -> i32 {
     let Tuple { bits, status } = rndrrs_();
     unsafe { *value = bits };
     status as i32
 }
-
-#[cfg(test)]
-mod test {
-    use super::*;
-    use stdarch_test::assert_instr;
-
-    #[cfg_attr(test, assert_instr(mrs))]
-    #[allow(dead_code)]
-    #[target_feature(enable = "rand")]
-    unsafe fn test_rndr(value: &mut u64) -> i32 {
-        __rndr(value)
-    }
-
-    #[cfg_attr(test, assert_instr(mrs))]
-    #[allow(dead_code)]
-    #[target_feature(enable = "rand")]
-    unsafe fn test_rndrrs(value: &mut u64) -> i32 {
-        __rndrrs(value)
-    }
-}

From 8155209828619951db645443e720e3a33c4046b9 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Mon, 16 Mar 2026 23:28:20 +0100
Subject: [PATCH 17/18] enable the `movrs` target feature in `core` and `std`

---
 library/core/src/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs
index 29869dd91982d..6ce1432011b47 100644
--- a/library/core/src/lib.rs
+++ b/library/core/src/lib.rs
@@ -182,6 +182,7 @@
 #![feature(hexagon_target_feature)]
 #![feature(loongarch_target_feature)]
 #![feature(mips_target_feature)]
+#![feature(movrs_target_feature)]
 #![feature(nvptx_target_feature)]
 #![feature(powerpc_target_feature)]
 #![feature(riscv_target_feature)]

From a72aceea2b526dd098b8e80cf24ee7485f7429b8 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Tue, 17 Mar 2026 01:00:25 +0100
Subject: [PATCH 18/18] correct `vpdpbusd` asserts in miri

in https://github.com/rust-lang/rust/commit/a24022ad4e98bfc5adc47cc114db57b68c8511d2 we changed the argument types to be more accurate, and now the miri asserts on the simd type/size need to reflect that
---
 src/tools/miri/src/shims/x86/avx512.rs | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/tools/miri/src/shims/x86/avx512.rs b/src/tools/miri/src/shims/x86/avx512.rs
index b057a78b6c8ee..23538f0dea965 100644
--- a/src/tools/miri/src/shims/x86/avx512.rs
+++ b/src/tools/miri/src/shims/x86/avx512.rs
@@ -188,23 +188,26 @@ fn vpdpbusd<'tcx>(
     let (b, b_len) = ecx.project_to_simd(b)?;
     let (dest, dest_len) = ecx.project_to_simd(dest)?;
 
-    // fn vpdpbusd(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
-    // fn vpdpbusd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
-    // fn vpdpbusd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    // fn vpdpbusd(src: i32x16, a: u8x64, b: i8x64) -> i32x16;
+    // fn vpdpbusd256(src: i32x8, a: u8x32, b: i8x32) -> i32x8;
+    // fn vpdpbusd128(src: i32x4, a: u8x16, b: i8x16) -> i32x4;
     assert_eq!(dest_len, src_len);
-    assert_eq!(dest_len, a_len);
-    assert_eq!(dest_len, b_len);
+    assert_eq!(dest_len * 4, a_len);
+    assert_eq!(a_len, b_len);
 
     for i in 0..dest_len {
         let src = ecx.read_scalar(&ecx.project_index(&src, i)?)?.to_i32()?;
-        let a = ecx.read_scalar(&ecx.project_index(&a, i)?)?.to_u32()?;
-        let b = ecx.read_scalar(&ecx.project_index(&b, i)?)?.to_u32()?;
         let dest = ecx.project_index(&dest, i)?;
 
-        let zipped = a.to_le_bytes().into_iter().zip(b.to_le_bytes());
-        let intermediate_sum: i32 = zipped
-            .map(|(a, b)| i32::from(a).strict_mul(i32::from(b.cast_signed())))
-            .fold(0, |x, y| x.strict_add(y));
+        let mut intermediate_sum: i32 = 0;
+        for j in 0..4 {
+            let idx = i.strict_mul(4).strict_add(j);
+            let a = ecx.read_scalar(&ecx.project_index(&a, idx)?)?.to_u8()?;
+            let b = ecx.read_scalar(&ecx.project_index(&b, idx)?)?.to_i8()?;
+
+            let product = i32::from(a).strict_mul(i32::from(b));
+            intermediate_sum = intermediate_sum.strict_add(product);
+        }
 
         // Use `wrapping_add` because `src` is an arbitrary i32 and the addition can overflow.
         let res = Scalar::from_i32(intermediate_sum.wrapping_add(src));