rust-lang/portable-simd#443: Add armv7 neon implementation for Simd<u8, 16>::swizzle_dyn

Use arm neon intrinsics to swizzle two u8x8 blocks with a u8x8x2 lookup table.
This commit is contained in:
Hans Kratz 2024-10-23 00:42:29 +02:00 committed by GitHub
parent 158e2409fe
commit 7e162d19dd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -57,6 +57,13 @@ where
target_endian = "little"
))]
16 => transize(vqtbl1q_u8, self, idxs),
#[cfg(all(
target_arch = "arm",
target_feature = "v7",
target_feature = "neon",
target_endian = "little"
))]
16 => transize(armv7_neon_swizzle_u8x16, self, idxs),
#[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
32 => transize(avx2_pshufb, self, idxs),
#[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
@ -98,6 +105,28 @@ where
}
}
/// Dynamic byte swizzle of a `u8x16` on armv7 neon.
///
/// armv7 neon has no single 16-byte table lookup, so `bytes` is split into
/// two `u8x8` halves that together form a `u8x8x2` lookup table. Each `u8x8`
/// half of `idxs` is then looked up in that table with `vtbl2_u8`, and the
/// two 8-byte results are joined back into one `u8x16`.
///
/// # Safety
/// This requires armv7 neon to work
#[cfg(all(
    target_arch = "arm",
    target_feature = "v7",
    target_feature = "neon",
    target_endian = "little"
))]
unsafe fn armv7_neon_swizzle_u8x16(bytes: Simd<u8, 16>, idxs: Simd<u8, 16>) -> Simd<u8, 16> {
    use core::arch::arm::{uint8x8x2_t, vcombine_u8, vget_high_u8, vget_low_u8, vtbl2_u8};
    // SAFETY: Caller promised arm neon support
    unsafe {
        // The full 16-byte source, expressed as the (low, high) pair `vtbl2_u8` expects.
        let table = uint8x8x2_t(vget_low_u8(bytes.into()), vget_high_u8(bytes.into()));
        // Split the indices the same way; each half selects from the whole table.
        let (idx_lo, idx_hi) = (vget_low_u8(idxs.into()), vget_high_u8(idxs.into()));
        vcombine_u8(vtbl2_u8(table, idx_lo), vtbl2_u8(table, idx_hi)).into()
    }
}
/// "vpshufb like it was meant to be" on AVX2
///
/// # Safety