diff --git a/crates/core_simd/src/mod.rs b/crates/core_simd/src/mod.rs index ece026a448b7..35c659b7a429 100644 --- a/crates/core_simd/src/mod.rs +++ b/crates/core_simd/src/mod.rs @@ -17,6 +17,7 @@ mod masks; mod ops; mod ord; mod select; +mod swizzle_dyn; mod vector; mod vendor; @@ -32,5 +33,6 @@ pub mod simd { pub use crate::core_simd::masks::*; pub use crate::core_simd::ord::*; pub use crate::core_simd::swizzle::*; + pub use crate::core_simd::swizzle_dyn::*; pub use crate::core_simd::vector::*; } diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs new file mode 100644 index 000000000000..5c3a2c1824ff --- /dev/null +++ b/crates/core_simd/src/swizzle_dyn.rs @@ -0,0 +1,155 @@ +use crate::simd::{LaneCount, Simd, SupportedLaneCount}; +use core::mem; + +impl Simd +where + LaneCount: SupportedLaneCount, +{ + /// Swizzle a vector of bytes according to the index vector. + /// Indices within range select the appropriate byte. + /// Indices "out of bounds" instead select 0. + /// + /// Note that the current implementation is selected during build-time + /// of the standard library, so `cargo build -Zbuild-std` may be necessary + /// to unlock better performance, especially for larger vectors. + /// A planned compiler improvement will enable using `#[target_feature]` instead. + #[inline] + pub fn swizzle_dyn(self, idxs: Simd) -> Self { + #![allow(unused_imports, unused_unsafe)] + #[cfg(target_arch = "aarch64")] + use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8}; + #[cfg(all(target_arch = "arm", target_feature = "v7"))] + use core::arch::arm::{uint8x8_t, vtbl1_u8}; + #[cfg(target_arch = "wasm32")] + use core::arch::wasm32 as wasm; + #[cfg(target_arch = "x86")] + use core::arch::x86; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64 as x86; + // SAFETY: Intrinsics covered by cfg + unsafe { + match N { + #[cfg(target_feature = "neon")] + 8 => transize(vtbl1_u8, self, idxs), + #[cfg(target_feature = "ssse3")] + 16 => transize(x86::_mm_shuffle_epi8, self, idxs), + #[cfg(target_feature = "simd128")] + 16 => transize(wasm::i8x16_swizzle, self, idxs), + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + 16 => transize(vqtbl1q_u8, self, idxs), + #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))] + 32 => transize_raw(avx2_pshufb, self, idxs), + #[cfg(target_feature = "avx512vl,avx512vbmi")] + 32 => transize(x86::_mm256_permutexvar_epi8, self, idxs), + // Notable absence: avx512bw shuffle + // If avx512bw is available, odds of avx512vbmi are good + #[cfg(target_feature = "avx512vbmi")] + 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs), + _ => { + let mut array = [0; N]; + for (i, k) in idxs.to_array().into_iter().enumerate() { + if (k as usize) < N { + array[i] = self[k as usize]; + }; + } + array.into() + } + } + } + } +} + +/// "vpshufb like it was meant to be" on AVX2 +/// +/// # Safety +/// This requires AVX2 to work +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[target_feature(enable = "avx2")] +#[allow(unused)] +#[inline] +unsafe fn avx2_pshufb(bytes: Simd, idxs: Simd) -> Simd { + use crate::simd::SimdPartialOrd; + #[cfg(target_arch = "x86")] + use core::arch::x86; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64 as x86; + use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle; + use x86::_mm256_shuffle_epi8 as avx2_half_pshufb; + let mid = Simd::splat(16u8); + let high = mid + mid; + // SAFETY: Caller promised AVX2 + unsafe { + // This is ordering sensitive, and LLVM will order these how you put them. + // Most AVX2 impls use ~5 "ports", and only 1 or 2 are capable of permutes. + // But the "compose" step will lower to ops that can also use at least 1 other port. + // So this tries to break up permutes so composition flows through "open" ports. + // Comparative benches should be done on multiple AVX2 CPUs before reordering this + + let hihi = avx2_cross_shuffle::<0x11>(bytes.into(), bytes.into()); + let hi_shuf = Simd::from(avx2_half_pshufb( + hihi, // duplicate the vector's top half + idxs.into(), // so that using only 4 bits of an index still picks bytes 16-31 + )); + // A zero-fill during the compose step gives the "all-Neon-like" OOB-is-0 semantics + let compose = idxs.simd_lt(high).select(hi_shuf, Simd::splat(0)); + let lolo = avx2_cross_shuffle::<0x00>(bytes.into(), bytes.into()); + let lo_shuf = Simd::from(avx2_half_pshufb(lolo, idxs.into())); + // Repeat, then pick indices < 16, overwriting indices 0-15 from previous compose step + let compose = idxs.simd_lt(mid).select(lo_shuf, compose); + compose + } +} + +/// This sets up a call to an architecture-specific function, and in doing so +/// it persuades rustc that everything is the correct size. Which it is. +/// This would not be needed if one could convince Rust that, by matching on N, +/// N is that value, and thus it would be valid to substitute e.g. 16. +/// +/// # Safety +/// The correctness of this function hinges on the sizes agreeing in actuality. +#[allow(dead_code)] +#[inline(always)] +unsafe fn transize( + f: unsafe fn(T, T) -> T, + bytes: Simd, + idxs: Simd, +) -> Simd +where + LaneCount: SupportedLaneCount, +{ + let idxs = zeroing_idxs(idxs); + // SAFETY: Same obligation to use this function as to use mem::transmute_copy. + unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) } +} + +/// Make indices that yield 0 for this architecture +#[inline(always)] +fn zeroing_idxs(idxs: Simd) -> Simd +where + LaneCount: SupportedLaneCount, +{ + // On x86, make sure the top bit is set. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + let idxs = { + use crate::simd::SimdPartialOrd; + idxs.simd_lt(Simd::splat(N as u8)) + .select(idxs, Simd::splat(u8::MAX)) + }; + // Simply do nothing on most architectures. + idxs +} + +/// As transize but no implicit call to `zeroing_idxs`. +#[allow(dead_code)] +#[inline(always)] +unsafe fn transize_raw( + f: unsafe fn(T, T) -> T, + bytes: Simd, + idxs: Simd, +) -> Simd +where + LaneCount: SupportedLaneCount, +{ + // SAFETY: Same obligation to use this function as to use mem::transmute_copy. + unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) } +} diff --git a/crates/core_simd/tests/swizzle_dyn.rs b/crates/core_simd/tests/swizzle_dyn.rs new file mode 100644 index 000000000000..646cd5f33833 --- /dev/null +++ b/crates/core_simd/tests/swizzle_dyn.rs @@ -0,0 +1,74 @@ +#![feature(portable_simd)] +use core::{fmt, ops::RangeInclusive}; +use proptest; +use test_helpers::{self, biteq, make_runner, prop_assert_biteq}; + +fn swizzle_dyn_scalar_ver(values: [u8; N], idxs: [u8; N]) -> [u8; N] { + let mut array = [0; N]; + for (i, k) in idxs.into_iter().enumerate() { + if (k as usize) < N { + array[i] = values[k as usize]; + }; + } + array +} + +test_helpers::test_lanes! { + fn swizzle_dyn() { + match_simd_with_fallback( + &core_simd::simd::Simd::::swizzle_dyn, + &swizzle_dyn_scalar_ver, + &|_, _| true, + ); + } +} + +fn match_simd_with_fallback( + fv: &dyn Fn(Vector, Vector) -> VectorResult, + fs: &dyn Fn([Scalar; N], [Scalar; N]) -> [ScalarResult; N], + check: &dyn Fn([Scalar; N], [Scalar; N]) -> bool, +) where + Scalar: Copy + fmt::Debug + SwizzleStrategy, + ScalarResult: Copy + biteq::BitEq + fmt::Debug + SwizzleStrategy, + Vector: Into<[Scalar; N]> + From<[Scalar; N]> + Copy, + VectorResult: Into<[ScalarResult; N]> + From<[ScalarResult; N]> + Copy, +{ + test_swizzles_2(&|x: [Scalar; N], y: [Scalar; N]| { + proptest::prop_assume!(check(x, y)); + let result_v: [ScalarResult; N] = fv(x.into(), y.into()).into(); + let result_s: [ScalarResult; N] = fs(x, y); + crate::prop_assert_biteq!(result_v, result_s); + Ok(()) + }); +} + +fn test_swizzles_2( + f: &dyn Fn(A, B) -> proptest::test_runner::TestCaseResult, +) { + let mut runner = make_runner(); + runner + .run( + &(A::swizzled_strategy(), B::swizzled_strategy()), + |(a, b)| f(a, b), + ) + .unwrap(); +} + +pub trait SwizzleStrategy { + type Strategy: proptest::strategy::Strategy; + fn swizzled_strategy() -> Self::Strategy; +} + +impl SwizzleStrategy for u8 { + type Strategy = RangeInclusive; + fn swizzled_strategy() -> Self::Strategy { + 0..=64 + } +} + +impl SwizzleStrategy for [T; N] { + type Strategy = test_helpers::array::UniformArrayStrategy; + fn swizzled_strategy() -> Self::Strategy { + Self::Strategy::new(T::swizzled_strategy()) + } +}