Rollup merge of #151611 - bonega:improve-is-slice-is-ascii-performance, r=folkertdev
Improve is_ascii performance on x86_64 with explicit SSE2 intrinsics # Summary Improves `slice::is_ascii` performance for SSE2 target roughly 1.5-2x on larger inputs. AVX-512 keeps similar performance characteristics. This is building on the work already merged in rust-lang/rust#151259. In particular this PR improves the default SSE2 performance; I don't consider this a temporary fix anymore. Thanks to @folkertdev for pointing me to consider `as_chunks` again. # The implementation: - Uses 64-byte chunks with 4x 16-byte SSE2 loads OR'd together - Extracts the MSB mask with a single `pmovmskb` instruction - Falls back to usize-at-a-time SWAR for inputs < 64 bytes # Performance impact (vs before rust-lang/rust#151259): - AVX-512: 34-48x faster - SSE2: 1.5-2x faster <details> <summary>Benchmark Results (click to expand)</summary> Benchmarked on AMD Ryzen 9 9950X (AVX-512 capable). Values show relative performance (1.00 = fastest). Tops out at 139GB/s for large inputs. ### early_non_ascii | Input Size | new_avx512 | new_sse2 | old_avx512 | old_sse2 | |------------|------------|----------|------------|----------| | 64 | 1.01 | **1.00** | 13.45 | 1.13 | | 1024 | 1.01 | **1.00** | 13.53 | 1.14 | | 65536 | 1.01 | **1.00** | 13.99 | 1.12 | | 1048576 | 1.02 | **1.00** | 13.29 | 1.12 | ### late_non_ascii | Input Size | new_avx512 | new_sse2 | old_avx512 | old_sse2 | |------------|------------|----------|------------|----------| | 64 | **1.00** | 1.01 | 13.37 | 1.13 | | 1024 | 1.10 | **1.00** | 42.42 | 1.95 | | 65536 | **1.00** | 1.06 | 42.22 | 1.73 | | 1048576 | **1.00** | 1.03 | 34.73 | 1.46 | ### pure_ascii | Input Size | new_avx512 | new_sse2 | old_avx512 | old_sse2 | |------------|------------|----------|------------|----------| | 4 | 1.03 | **1.00** | 1.75 | 1.32 | | 8 | **1.00** | 1.14 | 3.89 | 2.06 | | 16 | **1.00** | 1.04 | 1.13 | 1.62 | | 32 | 1.07 | 1.19 | 5.11 | **1.00** | | 64 | **1.00** | 1.13 | 13.32 | 1.57 | | 128 | **1.00** | 1.01 | 19.97 | 1.55 | | 256 | 
**1.00** | 1.02 | 27.77 | 1.61 | | 1024 | **1.00** | 1.02 | 41.34 | 1.84 | | 4096 | 1.02 | **1.00** | 45.61 | 1.98 | | 16384 | 1.01 | **1.00** | 48.67 | 2.04 | | 65536 | **1.00** | 1.03 | 43.86 | 1.77 | | 262144 | **1.00** | 1.06 | 41.44 | 1.79 | | 1048576 | 1.02 | **1.00** | 35.36 | 1.44 | </details> ## Reproduction / Test Projects Standalone validation tools: https://github.com/bonega/is-ascii-fix-validation - `bench/` - Criterion benchmarks for SSE2 vs AVX-512 comparison - `fuzz/` - Compares old/new implementations with libfuzzer Relates to: https://github.com/llvm/llvm-project/issues/176906
This commit is contained in:
commit
a6e8a31b86
2 changed files with 24 additions and 42 deletions
|
|
@ -460,56 +460,38 @@ const fn is_ascii(s: &[u8]) -> bool {
|
|||
)
|
||||
}
|
||||
|
||||
/// Chunk size for vectorized ASCII checking (two 16-byte SSE registers).
|
||||
/// Chunk size for SSE2 vectorized ASCII checking (4x 16-byte loads).
|
||||
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
|
||||
const CHUNK_SIZE: usize = 32;
|
||||
const SSE2_CHUNK_SIZE: usize = 64;
|
||||
|
||||
/// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
|
||||
/// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
|
||||
///
|
||||
/// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code.
|
||||
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
|
||||
#[inline]
|
||||
fn is_ascii_sse2(bytes: &[u8]) -> bool {
|
||||
use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
|
||||
|
||||
let mut i = 0;
|
||||
|
||||
while i + CHUNK_SIZE <= bytes.len() {
|
||||
// SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
|
||||
let ptr = unsafe { bytes.as_ptr().add(i) };
|
||||
|
||||
// Load two 16-byte chunks and combine them.
|
||||
// SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes.
|
||||
// `_mm_loadu_si128` allows unaligned loads.
|
||||
let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
|
||||
// SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range.
|
||||
let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };
|
||||
|
||||
// OR them together - if any byte has the high bit set, the result will too.
|
||||
// SAFETY: SSE2 is guaranteed by the cfg predicate.
|
||||
let combined = unsafe { _mm_or_si128(chunk1, chunk2) };
|
||||
|
||||
// Create a mask from the MSBs of each byte.
|
||||
// If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
|
||||
// SAFETY: SSE2 is guaranteed by the cfg predicate.
|
||||
let mask = unsafe { _mm_movemask_epi8(combined) };
|
||||
let (chunks, rest) = bytes.as_chunks::<SSE2_CHUNK_SIZE>();
|
||||
|
||||
for chunk in chunks {
|
||||
let ptr = chunk.as_ptr();
|
||||
// SAFETY: chunk is 64 bytes. SSE2 is baseline on x86_64.
|
||||
let mask = unsafe {
|
||||
let a1 = _mm_loadu_si128(ptr as *const __m128i);
|
||||
let a2 = _mm_loadu_si128(ptr.add(16) as *const __m128i);
|
||||
let b1 = _mm_loadu_si128(ptr.add(32) as *const __m128i);
|
||||
let b2 = _mm_loadu_si128(ptr.add(48) as *const __m128i);
|
||||
// OR all chunks - if any byte has high bit set, combined will too.
|
||||
let combined = _mm_or_si128(_mm_or_si128(a1, a2), _mm_or_si128(b1, b2));
|
||||
// Create a mask from the MSBs of each byte.
|
||||
// If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
|
||||
_mm_movemask_epi8(combined)
|
||||
};
|
||||
if mask != 0 {
|
||||
return false;
|
||||
}
|
||||
|
||||
i += CHUNK_SIZE;
|
||||
}
|
||||
|
||||
// Handle remaining bytes with simple loop
|
||||
while i < bytes.len() {
|
||||
if !bytes[i].is_ascii() {
|
||||
return false;
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
|
||||
true
|
||||
// Handle remaining bytes
|
||||
rest.iter().all(|b| b.is_ascii())
|
||||
}
|
||||
|
||||
/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
|
||||
|
|
@ -529,7 +511,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
|
|||
is_ascii_simple(bytes)
|
||||
} else {
|
||||
// For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead.
|
||||
if bytes.len() < CHUNK_SIZE {
|
||||
if bytes.len() < SSE2_CHUNK_SIZE {
|
||||
let chunks = bytes.chunks_exact(USIZE_SIZE);
|
||||
let remainder = chunks.remainder();
|
||||
for chunk in chunks {
|
||||
|
|
|
|||
|
|
@ -13,15 +13,15 @@
|
|||
/// Verify `is_ascii` generates efficient code on different architectures:
|
||||
///
|
||||
/// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
|
||||
/// The fix uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
|
||||
/// See: https://github.com/llvm/llvm-project/issues/176906
|
||||
/// Good version uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
|
||||
///
|
||||
/// - loongarch64: Should use `vmskltz.b` instruction for the fast-path.
|
||||
/// This architecture still relies on LLVM auto-vectorization.
|
||||
|
||||
// X86_64-LABEL: test_is_ascii
|
||||
// X86_64-NOT: kshiftrd
|
||||
// X86_64-NOT: kshiftrq
|
||||
// X86_64: {{vpor|por}}
|
||||
// X86_64: {{vpmovmskb|pmovmskb}}
|
||||
|
||||
// LA64-LABEL: test_is_ascii
|
||||
// LA64: vmskltz.b
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue