From e4363c287d134c1b9b36f526a759c8664f83f4c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Tue, 3 Oct 2023 23:00:22 +0200 Subject: [PATCH] Reimplement _mm_mulhi_epi16, _mm_mulhi_epu16, _mm256_mulhi_epi16 and _mm256_mulhi_epu16 without LLVM intrinsics --- library/stdarch/crates/core_arch/src/x86/avx2.rs | 14 ++++++++------ library/stdarch/crates/core_arch/src/x86/sse2.rs | 14 ++++++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs index 7ee5cee567e1..93cea66aa4e7 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx2.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs @@ -2090,7 +2090,10 @@ pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmulhw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(pmulhw(a.as_i16x16(), b.as_i16x16())) + let a = simd_cast::<_, i32x16>(a.as_i16x16()); + let b = simd_cast::<_, i32x16>(b.as_i16x16()); + let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); + transmute(simd_cast::<i32x16, i16x16>(r)) } /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing @@ -2103,7 +2106,10 @@ pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmulhuw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { - transmute(pmulhuw(a.as_u16x16(), b.as_u16x16())) + let a = simd_cast::<_, u32x16>(a.as_u16x16()); + let b = simd_cast::<_, u32x16>(b.as_u16x16()); + let r = simd_shr(simd_mul(a, b), u32x16::splat(16)); + transmute(simd_cast::<u32x16, u16x16>(r)) } /// Multiplies the packed 16-bit integers in `a` and `b`, producing @@ -3672,10 +3678,6 @@ extern "C" { fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4); #[link_name = 
"llvm.x86.avx2.mpsadbw"] fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16; - #[link_name = "llvm.x86.avx2.pmulhu.w"] - fn pmulhuw(a: u16x16, b: u16x16) -> u16x16; - #[link_name = "llvm.x86.avx2.pmulh.w"] - fn pmulhw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.pmul.dq"] fn pmuldq(a: i32x8, b: i32x8) -> i64x4; #[link_name = "llvm.x86.avx2.pmul.hr.sw"] diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs index 46146aab5452..107f0b0cf206 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse2.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs @@ -261,7 +261,10 @@ pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmulhw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { - transmute(pmulhw(a.as_i16x8(), b.as_i16x8())) + let a = simd_cast::<_, i32x8>(a.as_i16x8()); + let b = simd_cast::<_, i32x8>(b.as_i16x8()); + let r = simd_shr(simd_mul(a, b), i32x8::splat(16)); + transmute(simd_cast::<i32x8, i16x8>(r)) } /// Multiplies the packed unsigned 16-bit integers in `a` and `b`. @@ -275,7 +278,10 @@ pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmulhuw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { - transmute(pmulhuw(a.as_u16x8(), b.as_u16x8())) + let a = simd_cast::<_, u32x8>(a.as_u16x8()); + let b = simd_cast::<_, u32x8>(b.as_u16x8()); + let r = simd_shr(simd_mul(a, b), u32x8::splat(16)); + transmute(simd_cast::<u32x8, u16x8>(r)) } /// Multiplies the packed 16-bit integers in `a` and `b`. 
@@ -2838,10 +2844,6 @@ extern "C" { fn pavgw(a: u16x8, b: u16x8) -> u16x8; #[link_name = "llvm.x86.sse2.pmadd.wd"] fn pmaddwd(a: i16x8, b: i16x8) -> i32x4; - #[link_name = "llvm.x86.sse2.pmulh.w"] - fn pmulhw(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.x86.sse2.pmulhu.w"] - fn pmulhuw(a: u16x8, b: u16x8) -> u16x8; #[link_name = "llvm.x86.sse2.psad.bw"] fn psadbw(a: u8x16, b: u8x16) -> u64x2; #[link_name = "llvm.x86.sse2.psll.w"]