Add is_ascii function optimized for x86-64 for [u8]
The new `is_ascii` function is optimized to use the `pmovmskb` vector instruction, which builds a mask from the high bit of each byte lane. A byte is ASCII exactly when its high bit is unset, so this is the same check as ASCII validity and the check can be vectorized. `pmovmskb` does not exist on other platforms, where this code structure is likely to regress performance, so the new implementation is gated to `all(target_arch = "x86_64", target_feature = "sse2")`. Also: add a codegen test, and remove the `crate::mem` import for functions that are included in the prelude.
This commit is contained in:
parent
d7d67ad14b
commit
1b5c02b757
3 changed files with 85 additions and 21 deletions
|
|
@ -54,27 +54,29 @@ benches! {
|
|||
}
|
||||
|
||||
fn case04_while_loop(bytes: &[u8]) {
|
||||
// Constant chosen to enable `pmovmskb` instruction on x86-64
|
||||
const N: usize = 32;
|
||||
// Process chunks of 32 bytes at a time in the fast path to enable
|
||||
// auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
|
||||
// can be OR'd together and then the resulting vector can be tested for
|
||||
// non-ASCII bytes.
|
||||
const CHUNK_SIZE: usize = 32;
|
||||
|
||||
let mut i = 0;
|
||||
|
||||
while i + N <= bytes.len() {
|
||||
let chunk_end = i + N;
|
||||
while i + CHUNK_SIZE <= bytes.len() {
|
||||
let chunk_end = i + CHUNK_SIZE;
|
||||
|
||||
// Get LLVM to produce a `pmovmskb` instruction on x86-64 which
|
||||
// creates a mask from the most significant bit of each byte.
|
||||
// ASCII bytes are less than 128 (0x80), so their most significant
|
||||
// bit is unset. Thus, detecting non-ASCII bytes can be done in one
|
||||
// instruction.
|
||||
// bit is unset.
|
||||
let mut count = 0;
|
||||
while i < chunk_end {
|
||||
count += (bytes[i] <= 127) as u8;
|
||||
count += bytes[i].is_ascii() as u8;
|
||||
i += 1;
|
||||
}
|
||||
|
||||
// All bytes should be <= 127 so count is equal to chunk size.
|
||||
if count != N as u8 {
|
||||
if count != CHUNK_SIZE as u8 {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -82,7 +84,7 @@ benches! {
|
|||
// Process the remaining `bytes.len() % CHUNK_SIZE` bytes.
|
||||
let mut is_ascii = true;
|
||||
while i < bytes.len() {
|
||||
is_ascii &= bytes[i] <= 127;
|
||||
is_ascii &= bytes[i].is_ascii();
|
||||
i += 1;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3,8 +3,9 @@
|
|||
use core::ascii::EscapeDefault;
|
||||
|
||||
use crate::fmt::{self, Write};
|
||||
#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
|
||||
use crate::intrinsics::const_eval_select;
|
||||
use crate::{ascii, iter, mem, ops};
|
||||
use crate::{ascii, iter, ops};
|
||||
|
||||
#[cfg(not(test))]
|
||||
impl [u8] {
|
||||
|
|
@ -308,14 +309,6 @@ impl<'a> fmt::Debug for EscapeAscii<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
|
||||
/// from `../str/mod.rs`, which does something similar for utf8 validation.
|
||||
#[inline]
|
||||
const fn contains_nonascii(v: usize) -> bool {
|
||||
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
|
||||
(NONASCII_MASK & v) != 0
|
||||
}
|
||||
|
||||
/// ASCII test *without* the chunk-at-a-time optimizations.
|
||||
///
|
||||
/// This is carefully structured to produce nice small code -- it's smaller in
|
||||
|
|
@ -346,6 +339,7 @@ pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool {
|
|||
///
|
||||
/// If any of these loads produces something for which `contains_nonascii`
|
||||
/// (above) returns true, then we know the answer is false.
|
||||
#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
|
||||
#[inline]
|
||||
#[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior
|
||||
const fn is_ascii(s: &[u8]) -> bool {
|
||||
|
|
@ -356,7 +350,14 @@ const fn is_ascii(s: &[u8]) -> bool {
|
|||
if const {
|
||||
is_ascii_simple(s)
|
||||
} else {
|
||||
const USIZE_SIZE: usize = mem::size_of::<usize>();
|
||||
/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
|
||||
/// from `../str/mod.rs`, which does something similar for utf8 validation.
|
||||
const fn contains_nonascii(v: usize) -> bool {
|
||||
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
|
||||
(NONASCII_MASK & v) != 0
|
||||
}
|
||||
|
||||
const USIZE_SIZE: usize = size_of::<usize>();
|
||||
|
||||
let len = s.len();
|
||||
let align_offset = s.as_ptr().align_offset(USIZE_SIZE);
|
||||
|
|
@ -366,7 +367,7 @@ const fn is_ascii(s: &[u8]) -> bool {
|
|||
//
|
||||
// We also do this for architectures where `size_of::<usize>()` isn't
|
||||
// sufficient alignment for `usize`, because it's a weird edge case.
|
||||
if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() {
|
||||
if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < align_of::<usize>() {
|
||||
return is_ascii_simple(s);
|
||||
}
|
||||
|
||||
|
|
@ -400,7 +401,7 @@ const fn is_ascii(s: &[u8]) -> bool {
|
|||
// have alignment information it should have given a `usize::MAX` for
|
||||
// `align_offset` earlier, sending things through the scalar path instead of
|
||||
// this one, so this check should pass if it's reachable.
|
||||
debug_assert!(word_ptr.is_aligned_to(mem::align_of::<usize>()));
|
||||
debug_assert!(word_ptr.is_aligned_to(align_of::<usize>()));
|
||||
|
||||
// Read subsequent words until the last aligned word, excluding the last
|
||||
// aligned word by itself to be done in tail check later, to ensure that
|
||||
|
|
@ -435,3 +436,48 @@ const fn is_ascii(s: &[u8]) -> bool {
|
|||
}
|
||||
)
|
||||
}
|
||||
|
||||
/// ASCII test optimized to use the `pmovmskb` instruction available on `x86-64`
|
||||
/// platforms.
/// Returns `true` when every byte in `bytes` is ASCII; an empty slice yields `true`.
|
||||
///
|
||||
/// Other platforms are not likely to benefit from this code structure, so they
|
||||
/// use SWAR techniques to test for ASCII in `usize`-sized chunks.
|
||||
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
|
||||
#[inline]
|
||||
const fn is_ascii(bytes: &[u8]) -> bool {
|
||||
// Process chunks of 32 bytes at a time in the fast path to enable
|
||||
// auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
|
||||
// can be OR'd together and then the resulting vector can be tested for
|
||||
// non-ASCII bytes.
|
||||
const CHUNK_SIZE: usize = 32;
|
||||
|
||||
// `i` is the index of the next unexamined byte; it is shared by the chunked
// loop and the tail loop below.
let mut i = 0;
|
||||
|
||||
while i + CHUNK_SIZE <= bytes.len() {
|
||||
let chunk_end = i + CHUNK_SIZE;
|
||||
|
||||
// Get LLVM to produce a `pmovmskb` instruction on x86-64 which
|
||||
// creates a mask from the most significant bit of each byte.
|
||||
// ASCII bytes are less than 128 (0x80), so their most significant
|
||||
// bit is unset.
// `count` tallies the ASCII bytes in this chunk; it is at most
// `CHUNK_SIZE` (32), so it fits in a `u8`.
|
||||
let mut count = 0;
|
||||
while i < chunk_end {
|
||||
count += bytes[i].is_ascii() as u8;
|
||||
i += 1;
|
||||
}
|
||||
|
||||
// If every byte in the chunk is ASCII (<= 127), `count` equals `CHUNK_SIZE`;
// any shortfall means a non-ASCII byte was seen.
|
||||
if count != CHUNK_SIZE as u8 {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Process the remaining `bytes.len() % CHUNK_SIZE` bytes, accumulating the
// result with `&=` instead of returning early.
|
||||
let mut is_ascii = true;
|
||||
while i < bytes.len() {
|
||||
is_ascii &= bytes[i].is_ascii();
|
||||
i += 1;
|
||||
}
|
||||
|
||||
is_ascii
|
||||
}
|
||||
|
|
|
|||
16
tests/codegen/slice-is-ascii.rs
Normal file
16
tests/codegen/slice-is-ascii.rs
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
//@ only-x86_64
|
||||
//@ compile-flags: -C opt-level=3
|
||||
#![crate_type = "lib"]
|
||||
|
||||
/// Check that the fast-path of `is_ascii` uses a `pmovmskb` instruction.
|
||||
/// Platforms lacking an equivalent instruction use other techniques for
|
||||
/// optimizing `is_ascii`.
///
/// NOTE(review): the FileCheck patterns inside the function match vectorized
/// LLVM IR (a `<32 x i8>` load, a signed less-than compare, and a `<32 x i1>`
/// mask bitcast) rather than the `pmovmskb` mnemonic itself; presumably that
/// IR is what lowers to `pmovmskb` on x86-64 — confirm against the emitted
/// assembly.
|
||||
// CHECK-LABEL: @is_ascii_autovectorized
|
||||
#[no_mangle]
|
||||
pub fn is_ascii_autovectorized(s: &[u8]) -> bool {
|
||||
// CHECK: load <32 x i8>
|
||||
// CHECK-NEXT: icmp slt <32 x i8>
|
||||
// CHECK-NEXT: bitcast <32 x i1>
|
||||
// CHECK-NEXT: icmp eq i32
|
||||
s.is_ascii()
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue