Merge pull request #2012 from folkertdev/llvm-22-hadds-hsubs

x86: use `intrinsics::simd` for `hadds`/`hsubs`
2026-02-01 07:10:41 +00:00 · 2026-02-01 07:10:41 +00:00 · 4a1094cd11
commit 4a1094cd11
parent c1196d4333 e40ae4fe2c
2 changed files with 44 additions and 14 deletions
--- a/library/stdarch/crates/core_arch/src/x86/avx2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs
@@ -991,7 +991,21 @@ pub const fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vphaddsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) }
+    let a = a.as_i16x16();
+    let b = b.as_i16x16();
+    unsafe {
+        let even: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
+        );
+        let odd: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
+        );
+        simd_saturating_add(even, odd).as_m256i()
+    }
 }

 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
@@ -1047,7 +1061,21 @@ pub const fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vphsubsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) }
+    let a = a.as_i16x16();
+    let b = b.as_i16x16();
+    unsafe {
+        let even: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
+        );
+        let odd: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
+        );
+        simd_saturating_sub(even, odd).as_m256i()
+    }
 }

 /// Returns values from `slice` at offsets determined by `offsets * scale`,
@@ -3791,10 +3819,6 @@ pub const fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {

 #[allow(improper_ctypes)]
 unsafe extern "C" {
-    #[link_name = "llvm.x86.avx2.phadd.sw"]
-    fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.phsub.sw"]
-    fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
    #[link_name = "llvm.x86.avx2.pmadd.wd"]
    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
    #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
--- a/library/stdarch/crates/core_arch/src/x86/ssse3.rs
+++ b/library/stdarch/crates/core_arch/src/x86/ssse3.rs
@@ -188,7 +188,13 @@ pub const fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(phaddsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(phaddsw128(a.as_i16x8(), b.as_i16x8())) }
+    let a = a.as_i16x8();
+    let b = b.as_i16x8();
+    unsafe {
+        let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
+        let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
+        simd_saturating_add(even, odd).as_m128i()
+    }
 }

 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -240,7 +246,13 @@ pub const fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(phsubsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(phsubsw128(a.as_i16x8(), b.as_i16x8())) }
+    let a = a.as_i16x8();
+    let b = b.as_i16x8();
+    unsafe {
+        let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
+        let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
+        simd_saturating_sub(even, odd).as_m128i()
+    }
 }

 /// Horizontally subtract the adjacent pairs of values contained in 2
@@ -337,12 +349,6 @@ unsafe extern "C" {
    #[link_name = "llvm.x86.ssse3.pshuf.b.128"]
    fn pshufb128(a: u8x16, b: u8x16) -> u8x16;

-    #[link_name = "llvm.x86.ssse3.phadd.sw.128"]
-    fn phaddsw128(a: i16x8, b: i16x8) -> i16x8;
-
-    #[link_name = "llvm.x86.ssse3.phsub.sw.128"]
-    fn phsubsw128(a: i16x8, b: i16x8) -> i16x8;
-
    #[link_name = "llvm.x86.ssse3.pmadd.ub.sw.128"]
    fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8;