From 387c45ae167dbb4d4cfcf7590bf98e060400e775 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Tue, 3 Oct 2023 23:14:09 +0200 Subject: [PATCH] Reimplement _mm_avg_epu8, _mm_avg_epu16, _mm256_avg_epu8 and _mm256_avg_epu16 without LLVM intrinsics --- library/stdarch/crates/core_arch/src/x86/avx2.rs | 14 ++++++++------ library/stdarch/crates/core_arch/src/x86/sse2.rs | 14 ++++++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs index 93cea66aa4e7..80c59fe746b1 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx2.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs @@ -344,7 +344,10 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpavgw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { - transmute(pavgw(a.as_u16x16(), b.as_u16x16())) + let a = simd_cast::<_, u32x16>(a.as_u16x16()); + let b = simd_cast::<_, u32x16>(b.as_u16x16()); + let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); + transmute(simd_cast::<_, u16x16>(r)) } /// Averages packed unsigned 8-bit integers in `a` and `b`. @@ -355,7 +358,10 @@ pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpavgb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { - transmute(pavgb(a.as_u8x32(), b.as_u8x32())) + let a = simd_cast::<_, u16x32>(a.as_u8x32()); + let b = simd_cast::<_, u16x32>(b.as_u8x32()); + let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); + transmute(simd_cast::<_, u8x32>(r)) } /// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`. @@ -3638,10 +3644,6 @@ extern "C" { fn pabsw(a: i16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pabs.d"] fn pabsd(a: i32x8) -> u32x8; - #[link_name = "llvm.x86.avx2.pavg.b"] - fn pavgb(a: u8x32, b: u8x32) -> u8x32; - #[link_name = "llvm.x86.avx2.pavg.w"] - fn pavgw(a: u16x16, b: u16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pblendvb"] fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32; #[link_name = "llvm.x86.avx2.phadd.w"] diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs index 107f0b0cf206..22fa62ed7d3c 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse2.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs @@ -165,7 +165,10 @@ pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pavgb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { - transmute(pavgb(a.as_u8x16(), b.as_u8x16())) + let a = simd_cast::<_, u16x16>(a.as_u8x16()); + let b = simd_cast::<_, u16x16>(b.as_u8x16()); + let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); + transmute(simd_cast::<_, u8x16>(r)) } /// Averages packed unsigned 16-bit integers in `a` and `b`. @@ -176,7 +179,10 @@ pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pavgw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { - transmute(pavgw(a.as_u16x8(), b.as_u16x8())) + let a = simd_cast::<_, u32x8>(a.as_u16x8()); + let b = simd_cast::<_, u32x8>(b.as_u16x8()); + let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); + transmute(simd_cast::<_, u16x8>(r)) } /// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`. @@ -2838,10 +2844,6 @@ extern "C" { fn lfence(); #[link_name = "llvm.x86.sse2.mfence"] fn mfence(); - #[link_name = "llvm.x86.sse2.pavg.b"] - fn pavgb(a: u8x16, b: u8x16) -> u8x16; - #[link_name = "llvm.x86.sse2.pavg.w"] - fn pavgw(a: u16x8, b: u16x8) -> u16x8; #[link_name = "llvm.x86.sse2.pmadd.wd"] fn pmaddwd(a: i16x8, b: i16x8) -> i32x4; #[link_name = "llvm.x86.sse2.psad.bw"]