Rollup merge of #147436 - okaneco:eq_ignore_ascii_autovec, r=scottmcm
slice/ascii: Optimize `eq_ignore_ascii_case` with auto-vectorization
- Refactor the current functionality into a helper function
- Use `as_chunks` to encourage auto-vectorization in the optimized chunk processing function
- Add a codegen test checking for vectorization and no panicking
- Add benches for `eq_ignore_ascii_case`
---
The optimized function is initially only enabled for x86_64 which has `sse2` as part of its baseline, but none of the code is platform specific. Other platforms with SIMD instructions may also benefit from this implementation.
Performance improvements only manifest for slices of 16 bytes or longer, so the optimized path is gated behind a check that the slice length is at least 16 bytes.
Benchmarks: cases below 16 bytes are unaffected, while all cases above that threshold show sizeable improvements.
```
before:
str::eq_ignore_ascii_case::bench_large_str_eq 4942.30ns/iter +/- 48.20
str::eq_ignore_ascii_case::bench_medium_str_eq 632.01ns/iter +/- 16.87
str::eq_ignore_ascii_case::bench_str_17_bytes_eq 16.28ns/iter +/- 0.45
str::eq_ignore_ascii_case::bench_str_31_bytes_eq 35.23ns/iter +/- 2.28
str::eq_ignore_ascii_case::bench_str_of_8_bytes_eq 7.56ns/iter +/- 0.22
str::eq_ignore_ascii_case::bench_str_under_8_bytes_eq 2.64ns/iter +/- 0.06
after:
str::eq_ignore_ascii_case::bench_large_str_eq 611.63ns/iter +/- 28.29
str::eq_ignore_ascii_case::bench_medium_str_eq 77.10ns/iter +/- 19.76
str::eq_ignore_ascii_case::bench_str_17_bytes_eq 3.49ns/iter +/- 0.39
str::eq_ignore_ascii_case::bench_str_31_bytes_eq 3.50ns/iter +/- 0.27
str::eq_ignore_ascii_case::bench_str_of_8_bytes_eq 7.27ns/iter +/- 0.09
str::eq_ignore_ascii_case::bench_str_under_8_bytes_eq 2.60ns/iter +/- 0.05
```
This commit is contained in:
commit
1c892e829c
4 changed files with 140 additions and 0 deletions
|
|
@ -62,6 +62,25 @@ impl [u8] {
|
|||
return false;
|
||||
}
|
||||
|
||||
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
|
||||
{
|
||||
const CHUNK_SIZE: usize = 16;
|
||||
// The following function has two invariants:
|
||||
// 1. The slice lengths must be equal, which we checked above.
|
||||
// 2. The slice lengths must be greater than or equal to N, which this
|
||||
// if-statement is checking.
|
||||
if self.len() >= CHUNK_SIZE {
|
||||
return self.eq_ignore_ascii_case_chunks::<CHUNK_SIZE>(other);
|
||||
}
|
||||
}
|
||||
|
||||
self.eq_ignore_ascii_case_simple(other)
|
||||
}
|
||||
|
||||
/// ASCII case-insensitive equality check without chunk-at-a-time
|
||||
/// optimization.
|
||||
#[inline]
|
||||
const fn eq_ignore_ascii_case_simple(&self, other: &[u8]) -> bool {
|
||||
// FIXME(const-hack): This implementation can be reverted when
|
||||
// `core::iter::zip` is allowed in const. The original implementation:
|
||||
// self.len() == other.len() && iter::zip(self, other).all(|(a, b)| a.eq_ignore_ascii_case(b))
|
||||
|
|
@ -80,6 +99,65 @@ impl [u8] {
|
|||
true
|
||||
}
|
||||
|
||||
/// Optimized version of `eq_ignore_ascii_case` to process chunks at a time.
|
||||
///
|
||||
/// Platforms that have SIMD instructions may benefit from this
|
||||
/// implementation over `eq_ignore_ascii_case_simple`.
|
||||
///
|
||||
/// # Invariants
|
||||
///
|
||||
/// The caller must guarantee that the slices are equal in length, and the
|
||||
/// slice lengths are greater than or equal to `N` bytes.
|
||||
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
|
||||
#[inline]
|
||||
const fn eq_ignore_ascii_case_chunks<const N: usize>(&self, other: &[u8]) -> bool {
|
||||
// FIXME(const-hack): The while-loops that follow should be replaced by
|
||||
// for-loops when available in const.
|
||||
|
||||
let (self_chunks, self_rem) = self.as_chunks::<N>();
|
||||
let (other_chunks, _) = other.as_chunks::<N>();
|
||||
|
||||
// Branchless check to encourage auto-vectorization
|
||||
#[inline(always)]
|
||||
const fn eq_ignore_ascii_inner<const L: usize>(lhs: &[u8; L], rhs: &[u8; L]) -> bool {
|
||||
let mut equal_ascii = true;
|
||||
let mut j = 0;
|
||||
while j < L {
|
||||
equal_ascii &= lhs[j].eq_ignore_ascii_case(&rhs[j]);
|
||||
j += 1;
|
||||
}
|
||||
|
||||
equal_ascii
|
||||
}
|
||||
|
||||
// Process the chunks, returning early if an inequality is found
|
||||
let mut i = 0;
|
||||
while i < self_chunks.len() && i < other_chunks.len() {
|
||||
if !eq_ignore_ascii_inner(&self_chunks[i], &other_chunks[i]) {
|
||||
return false;
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
|
||||
// Check the length invariant which is necessary for the tail-handling
|
||||
// logic to be correct. This should have been upheld by the caller,
|
||||
// otherwise lengths less than N will compare as true without any
|
||||
// checking.
|
||||
debug_assert!(self.len() >= N);
|
||||
|
||||
// If there are remaining tails, load the last N bytes in the slices to
|
||||
// avoid falling back to per-byte checking.
|
||||
if !self_rem.is_empty() {
|
||||
if let (Some(a_rem), Some(b_rem)) = (self.last_chunk::<N>(), other.last_chunk::<N>()) {
|
||||
if !eq_ignore_ascii_inner(a_rem, b_rem) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
|
||||
/// Converts this slice to its ASCII upper case equivalent in-place.
|
||||
///
|
||||
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ use test::{Bencher, black_box};
|
|||
mod char_count;
|
||||
mod corpora;
|
||||
mod debug;
|
||||
mod eq_ignore_ascii_case;
|
||||
mod iter;
|
||||
|
||||
#[bench]
|
||||
|
|
|
|||
45
library/coretests/benches/str/eq_ignore_ascii_case.rs
Normal file
45
library/coretests/benches/str/eq_ignore_ascii_case.rs
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
use test::{Bencher, black_box};
|
||||
|
||||
use super::corpora::*;
|
||||
|
||||
#[bench]
|
||||
fn bench_str_under_8_bytes_eq(b: &mut Bencher) {
|
||||
let s = black_box("foo");
|
||||
let other = black_box("foo");
|
||||
b.iter(|| assert!(s.eq_ignore_ascii_case(other)))
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_str_of_8_bytes_eq(b: &mut Bencher) {
|
||||
let s = black_box(en::TINY);
|
||||
let other = black_box(en::TINY);
|
||||
b.iter(|| assert!(s.eq_ignore_ascii_case(other)))
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_str_17_bytes_eq(b: &mut Bencher) {
|
||||
let s = black_box(&en::SMALL[..17]);
|
||||
let other = black_box(&en::SMALL[..17]);
|
||||
b.iter(|| assert!(s.eq_ignore_ascii_case(other)))
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_str_31_bytes_eq(b: &mut Bencher) {
|
||||
let s = black_box(&en::SMALL[..31]);
|
||||
let other = black_box(&en::SMALL[..31]);
|
||||
b.iter(|| assert!(s.eq_ignore_ascii_case(other)))
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_medium_str_eq(b: &mut Bencher) {
|
||||
let s = black_box(en::MEDIUM);
|
||||
let other = black_box(en::MEDIUM);
|
||||
b.iter(|| assert!(s.eq_ignore_ascii_case(other)))
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_large_str_eq(b: &mut Bencher) {
|
||||
let s = black_box(en::LARGE);
|
||||
let other = black_box(en::LARGE);
|
||||
b.iter(|| assert!(s.eq_ignore_ascii_case(other)))
|
||||
}
|
||||
16
tests/codegen-llvm/lib-optimizations/eq_ignore_ascii_case.rs
Normal file
16
tests/codegen-llvm/lib-optimizations/eq_ignore_ascii_case.rs
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
//@ compile-flags: -Copt-level=3
|
||||
//@ only-x86_64
|
||||
#![crate_type = "lib"]
|
||||
|
||||
// Codegen expectations for `str::eq_ignore_ascii_case`: the emitted IR should
// contain 16-byte vector loads for both operands, the inner per-chunk helper
// should be inlined away rather than called, and no panic path should remain.
// The directive lines below are matched in order against the compiler output.
// CHECK-LABEL: @eq_ignore_ascii_case_autovectorized
#[no_mangle]
pub fn eq_ignore_ascii_case_autovectorized(s: &str, other: &str) -> bool {
    // CHECK: load <16 x i8>
    // CHECK: load <16 x i8>
    // CHECK: bitcast <16 x i1>
    // CHECK-NOT: call {{.*}}eq_ignore_ascii_inner
    // CHECK-NOT: panic
    str::eq_ignore_ascii_case(s, other)
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue