From 73c3905de7b2a7cc14594080a72d4cd58391ebc7 Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Thu, 18 Sep 2025 08:07:51 +0800 Subject: [PATCH] Add `is_ascii` function optimized for LoongArch64 for [u8] Similar to x86_64, on LoongArch64 we use the `vmskltz.b` instruction to test the high bit in a lane. For longer input cases, the performance improvement is significant. For unaligned cases close to 32 bytes in length, there's some regression, but it seems acceptable. | core benches (MB/s) | Before | After | % | |--------------------------------------------------------|--------|--------|---------| | ascii::is_ascii::short::case00_libcore | 1000 | 1000 | 0.00 | | ascii::is_ascii::medium::case00_libcore | 8000 | 8000 | 0.00 | | ascii::is_ascii::long::case00_libcore | 183947 | 436875 | +137.50 | | ascii::is_ascii::unaligned_head_medium::case00_libcore | 7750 | 2818 | -63.64 | | ascii::is_ascii::unaligned_head_long::case00_libcore | 317681 | 436812 | +37.50 | | ascii::is_ascii::unaligned_tail_medium::case00_libcore | 7750 | 3444 | -55.56 | | ascii::is_ascii::unaligned_tail_long::case00_libcore | 155311 | 436812 | +181.25 | | ascii::is_ascii::unaligned_both_medium::case00_libcore | 7500 | 3333 | -55.56 | | ascii::is_ascii::unaligned_both_long::case00_libcore | 174700 | 436750 | +150.00 | --- library/core/src/slice/ascii.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs index e17a2e03d2dc..d02be440f5bf 100644 --- a/library/core/src/slice/ascii.rs +++ b/library/core/src/slice/ascii.rs @@ -3,7 +3,10 @@ use core::ascii::EscapeDefault; use crate::fmt::{self, Write}; -#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))] +#[cfg(not(any( + all(target_arch = "x86_64", target_feature = "sse2"), + all(target_arch = "loongarch64", target_feature = "lsx") +)))] use crate::intrinsics::const_eval_select; use crate::{ascii, iter, ops}; @@ -357,7 +360,10 @@ pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool { /// /// If any of these loads produces something for which `contains_nonascii` /// (above) returns true, then we know the answer is false. -#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))] +#[cfg(not(any( + all(target_arch = "x86_64", target_feature = "sse2"), + all(target_arch = "loongarch64", target_feature = "lsx") +)))] #[inline] #[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior const fn is_ascii(s: &[u8]) -> bool { @@ -455,12 +461,15 @@ const fn is_ascii(s: &[u8]) -> bool { ) } -/// ASCII test optimized to use the `pmovmskb` instruction available on `x86-64` -/// platforms. +/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64` and the +/// `vmskltz.b` instruction on `loongarch64`. /// /// Other platforms are not likely to benefit from this code structure, so they /// use SWAR techniques to test for ASCII in `usize`-sized chunks. -#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] +#[cfg(any( + all(target_arch = "x86_64", target_feature = "sse2"), + all(target_arch = "loongarch64", target_feature = "lsx") +))] #[inline] const fn is_ascii(bytes: &[u8]) -> bool { // Process chunks of 32 bytes at a time in the fast path to enable