From 507391d2c8306f1d3c0ac787a69c03ede11c6661 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Tue, 3 Oct 2023 22:48:44 +0200 Subject: [PATCH] Reimplement _mm_mul_epu32 and _mm256_mul_epu32 without LLVM intrinsics --- library/stdarch/crates/core_arch/src/x86/avx2.rs | 7 ++++--- library/stdarch/crates/core_arch/src/x86/sse2.rs | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs index e23c795ee757..7ee5cee567e1 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx2.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs @@ -2074,7 +2074,10 @@ pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmuludq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { - transmute(pmuludq(a.as_u32x8(), b.as_u32x8())) + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let mask = u64x4::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) } /// Multiplies the packed 16-bit integers in `a` and `b`, producing @@ -3675,8 +3678,6 @@ extern "C" { fn pmulhw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.pmul.dq"] fn pmuldq(a: i32x8, b: i32x8) -> i64x4; - #[link_name = "llvm.x86.avx2.pmulu.dq"] - fn pmuludq(a: u32x8, b: u32x8) -> u64x4; #[link_name = "llvm.x86.avx2.pmul.hr.sw"] fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.packsswb"] diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs index 8dbad60eed64..46146aab5452 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse2.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs @@ -303,7 +303,10 @@ pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmuludq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { - transmute(pmuludq(a.as_u32x4(), b.as_u32x4())) + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let mask = u64x2::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) } /// Sum the absolute differences of packed unsigned 8-bit integers. @@ -2839,8 +2842,6 @@ extern "C" { fn pmulhw(a: i16x8, b: i16x8) -> i16x8; #[link_name = "llvm.x86.sse2.pmulhu.w"] fn pmulhuw(a: u16x8, b: u16x8) -> u16x8; - #[link_name = "llvm.x86.sse2.pmulu.dq"] - fn pmuludq(a: u32x4, b: u32x4) -> u64x2; #[link_name = "llvm.x86.sse2.psad.bw"] fn psadbw(a: u8x16, b: u8x16) -> u64x2; #[link_name = "llvm.x86.sse2.psll.w"]