diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs
index 459c826f4064..ae641871279b 100644
--- a/library/core/src/slice/ascii.rs
+++ b/library/core/src/slice/ascii.rs
@@ -460,56 +460,38 @@ const fn is_ascii(s: &[u8]) -> bool {
     )
 }
 
-/// Chunk size for vectorized ASCII checking (two 16-byte SSE registers).
+/// Chunk size for SSE2 vectorized ASCII checking (4x 16-byte loads).
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
-const CHUNK_SIZE: usize = 32;
+const SSE2_CHUNK_SIZE: usize = 64;
 
-/// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
-/// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
-///
-/// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code.
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+#[inline]
 fn is_ascii_sse2(bytes: &[u8]) -> bool {
     use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
 
-    let mut i = 0;
-
-    while i + CHUNK_SIZE <= bytes.len() {
-        // SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
-        let ptr = unsafe { bytes.as_ptr().add(i) };
-
-        // Load two 16-byte chunks and combine them.
-        // SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes.
-        // `_mm_loadu_si128` allows unaligned loads.
-        let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
-        // SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range.
-        let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };
-
-        // OR them together - if any byte has the high bit set, the result will too.
-        // SAFETY: SSE2 is guaranteed by the cfg predicate.
-        let combined = unsafe { _mm_or_si128(chunk1, chunk2) };
-
-        // Create a mask from the MSBs of each byte.
-        // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
-        // SAFETY: SSE2 is guaranteed by the cfg predicate.
-        let mask = unsafe { _mm_movemask_epi8(combined) };
+    let (chunks, rest) = bytes.as_chunks::<SSE2_CHUNK_SIZE>();
+    for chunk in chunks {
+        let ptr = chunk.as_ptr();
+        // SAFETY: `chunk` is exactly 64 bytes, `_mm_loadu_si128` permits
+        // unaligned loads, and SSE2 is baseline on x86_64.
+        let mask = unsafe {
+            let a1 = _mm_loadu_si128(ptr as *const __m128i);
+            let a2 = _mm_loadu_si128(ptr.add(16) as *const __m128i);
+            let b1 = _mm_loadu_si128(ptr.add(32) as *const __m128i);
+            let b2 = _mm_loadu_si128(ptr.add(48) as *const __m128i);
+            // OR all four loads together - if any byte has the high bit set,
+            // the combined result will too.
+            let combined = _mm_or_si128(_mm_or_si128(a1, a2), _mm_or_si128(b1, b2));
+            // Create a mask from the MSBs of each byte.
+            // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
+            _mm_movemask_epi8(combined)
+        };
 
         if mask != 0 {
             return false;
         }
-
-        i += CHUNK_SIZE;
     }
 
-    // Handle remaining bytes with simple loop
-    while i < bytes.len() {
-        if !bytes[i].is_ascii() {
-            return false;
-        }
-        i += 1;
-    }
-
-    true
+    // Handle remaining bytes
+    rest.iter().all(|b| b.is_ascii())
 }
 
 /// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
@@ -529,7 +511,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
         is_ascii_simple(bytes)
     } else {
         // For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead.
-        if bytes.len() < CHUNK_SIZE {
+        if bytes.len() < SSE2_CHUNK_SIZE {
             let chunks = bytes.chunks_exact(USIZE_SIZE);
             let remainder = chunks.remainder();
             for chunk in chunks {
diff --git a/tests/assembly-llvm/slice-is-ascii.rs b/tests/assembly-llvm/slice-is-ascii.rs
index d01b321bf460..b9a520505498 100644
--- a/tests/assembly-llvm/slice-is-ascii.rs
+++ b/tests/assembly-llvm/slice-is-ascii.rs
@@ -13,15 +13,15 @@
 /// Verify `is_ascii` generates efficient code on different architectures:
 ///
 /// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
-///   The fix uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
-///   See: https://github.com/llvm/llvm-project/issues/176906
+///   The good version uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
 ///
 /// - loongarch64: Should use `vmskltz.b` instruction for the fast-path.
-///   This architecture still relies on LLVM auto-vectorization.
 // X86_64-LABEL: test_is_ascii
 // X86_64-NOT: kshiftrd
 // X86_64-NOT: kshiftrq
+// X86_64: {{vpor|por}}
+// X86_64: {{vpmovmskb|pmovmskb}}
 
 // LA64-LABEL: test_is_ascii
 // LA64: vmskltz.b
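
For context, the heart of the patch is the classic `pmovmskb` ASCII test: ASCII bytes are exactly `0x00..=0x7F`, so a slice is all-ASCII iff no byte has its most significant bit set. `_mm_movemask_epi8` gathers the MSB of each of the 16 byte lanes of an SSE register into an integer, so a single comparison against zero tests 16 bytes at once; the patch ORs four such loads first so each comparison covers 64 bytes. Below is a minimal standalone sketch of the single-register version of that check. It is illustrative, not the patch's loop, and it uses `std::arch` rather than core's internal `crate::arch` path; the helper name `chunk16_is_ascii` is made up for the example.

```rust
/// Minimal sketch of the `pmovmskb` trick (illustrative, not the patch itself).
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
fn chunk16_is_ascii(chunk: &[u8; 16]) -> bool {
    use std::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8};
    // SAFETY: `chunk` is exactly 16 bytes, `_mm_loadu_si128` permits
    // unaligned loads, and SSE2 is guaranteed by the cfg predicate.
    let mask = unsafe {
        let v = _mm_loadu_si128(chunk.as_ptr() as *const __m128i);
        // Collect the high bit of each byte lane into the low 16 bits of an i32.
        _mm_movemask_epi8(v)
    };
    // A non-zero mask means some byte is >= 0x80, i.e. non-ASCII.
    mask == 0
}
```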
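
The small-input branch touched by the second hunk uses the same idea one machine word at a time instead of one SSE register at a time: replicate `0x80` into every byte of a `usize`, and a single AND then reveals whether any byte in the word has its high bit set. A sketch under assumed names (`NONASCII_MASK` and `is_ascii_words` are illustrative; the actual constants and control flow live elsewhere in core's `ascii.rs`):

```rust
/// 0x8080...80: the high bit of every byte set (illustrative name).
const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; std::mem::size_of::<usize>()]);

/// Word-at-a-time ASCII check, mirroring the shape of the small-input path.
fn is_ascii_words(bytes: &[u8]) -> bool {
    let chunks = bytes.chunks_exact(std::mem::size_of::<usize>());
    let remainder = chunks.remainder();
    for chunk in chunks {
        // One AND per word detects a set high bit in any of its bytes.
        let word = usize::from_ne_bytes(chunk.try_into().unwrap());
        if word & NONASCII_MASK != 0 {
            return false;
        }
    }
    // The sub-word tail is checked byte-wise.
    remainder.iter().all(|b| b.is_ascii())
}
```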