Rollup merge of #144938 - tgross35:more-outline-atomics, r=davidtwco

Enable `outline-atomics` by default on more AArch64 platforms

The baseline Armv8.0 ISA has no single-instruction atomics, but in
practice most hardware is at least Armv8.1-A (2014), which adds them
as part of the LSE feature. As a performance
optimization for these cases, GCC and LLVM have the `-moutline-atomics` flag
to turn atomic operations into calls to symbols like `__aarch64_cas1_acq`.
These can do runtime feature detection and use the LSE instructions if
available, falling back to more portable load-exclusive/store-exclusive
loops.

Since the recent 3b50253b57 ("compiler-builtins: plumb LSE support
for aarch64 on linux") our builtins support this LSE optimization, and
since 6936bb975a ("Dynamically enable LSE for aarch64 rust provided
intrinsics"), std will set the flag as part of its startup code. The first
commit in this PR configures this to work on all platforms built with
`outline-atomics`, not just Linux.

Thus, enable `outline-atomics` by default on Android, OpenBSD, Windows,
and Fuchsia platforms that don't have LSE in the baseline. The feature is
already enabled on Linux. Platform-specific details are included in each
commit message.

The current implementation can still be accessed by setting
`-Ctarget-feature=-outline-atomics`. Setting `-Ctarget-feature=+lse` or
a relevant CPU will use the single-instruction atomics without the call
overhead. https://rust.godbolt.org/z/dsdrzszoe

Link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/lse/intro/
Original Clang outline-atomics benchmarks: https://reviews.llvm.org/D91157#2435844

try-job: aarch64-msvc-*
try-job: arm-android
try-job: dist-android
try-job: dist-aarch64-llvm-mingw
try-job: dist-aarch64-msvc
try-job: dist-various-*
try-job: test-various
This commit is contained in:
Matthias Krüger 2025-12-09 17:36:47 +01:00 committed by GitHub
commit 76370238b0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 52 additions and 16 deletions

View file

@ -21,7 +21,7 @@ pub(crate) fn target() -> Target {
max_atomic_width: Some(128),
// As documented in https://developer.android.com/ndk/guides/cpu-features.html
// the neon (ASIMD) and FP must exist on all android aarch64 targets.
features: "+v8a,+neon".into(),
features: "+v8a,+neon,+outline-atomics".into(),
// the AAPCS64 expects use of non-leaf frame pointers per
// https://github.com/ARM-software/abi-aa/blob/4492d1570eb70c8fd146623e0db65b2d241f12e7/aapcs64/aapcs64.rst#the-frame-pointer
// and we tend to encounter interesting bugs in AArch64 unwinding code if we do not

View file

@ -3,7 +3,7 @@ use crate::spec::{Arch, Cc, FramePointer, LinkerFlavor, Lld, Target, TargetMetad
pub(crate) fn target() -> Target {
let mut base = base::windows_gnullvm::opts();
base.max_atomic_width = Some(128);
base.features = "+v8a,+neon".into();
base.features = "+v8a,+neon,+outline-atomics".into();
base.linker = Some("aarch64-w64-mingw32-clang".into());
base.add_pre_link_args(LinkerFlavor::Gnu(Cc::No, Lld::No), &["-m", "arm64pe"]);

View file

@ -3,7 +3,7 @@ use crate::spec::{Arch, FramePointer, Target, TargetMetadata, base};
pub(crate) fn target() -> Target {
let mut base = base::windows_msvc::opts();
base.max_atomic_width = Some(128);
base.features = "+v8a,+neon".into();
base.features = "+v8a,+neon,+outline-atomics".into();
// Microsoft recommends enabling frame pointers on Arm64 Windows.
// From https://learn.microsoft.com/en-us/cpp/build/arm64-windows-abi-conventions?view=msvc-170#integer-registers

View file

@ -5,7 +5,7 @@ use crate::spec::{
pub(crate) fn target() -> Target {
let mut base = base::fuchsia::opts();
base.cpu = "generic".into();
base.features = "+v8a,+crc,+aes,+sha2,+neon".into();
base.features = "+v8a,+crc,+aes,+sha2,+neon,+outline-atomics".into();
base.max_atomic_width = Some(128);
base.stack_probes = StackProbeType::Inline;
base.supported_sanitizers = SanitizerSet::ADDRESS

View file

@ -13,7 +13,7 @@ pub(crate) fn target() -> Target {
data_layout: "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32".into(),
arch: Arch::AArch64,
options: TargetOptions {
features: "+v8a".into(),
features: "+v8a,+outline-atomics".into(),
max_atomic_width: Some(128),
stack_probes: StackProbeType::Inline,
..base::openbsd::opts()

View file

@ -196,7 +196,7 @@ macro_rules! compare_and_swap {
"cbnz w17, 0b",
"1:",
"ret",
have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
have_lse = sym crate::aarch64_outline_atomics::HAVE_LSE_ATOMICS,
}
}
}
@ -228,7 +228,7 @@ macro_rules! compare_and_swap_i128 {
"cbnz w15, 0b",
"1:",
"ret",
have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
have_lse = sym crate::aarch64_outline_atomics::HAVE_LSE_ATOMICS,
}
}
}
@ -256,7 +256,7 @@ macro_rules! swap {
concat!(stxr!($ordering, $bytes), " w17, ", reg!($bytes, 16), ", [x1]"),
"cbnz w17, 0b",
"ret",
have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
have_lse = sym crate::aarch64_outline_atomics::HAVE_LSE_ATOMICS,
}
}
}
@ -286,7 +286,7 @@ macro_rules! fetch_op {
concat!(stxr!($ordering, $bytes), " w15, ", reg!($bytes, 17), ", [x1]"),
"cbnz w15, 0b",
"ret",
have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
have_lse = sym crate::aarch64_outline_atomics::HAVE_LSE_ATOMICS,
}
}
}

View file

@ -55,8 +55,8 @@ pub mod arm;
#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
pub mod aarch64;
#[cfg(all(target_arch = "aarch64", target_os = "linux"))]
pub mod aarch64_linux;
#[cfg(all(target_arch = "aarch64", target_feature = "outline-atomics"))]
pub mod aarch64_outline_atomics;
#[cfg(all(
kernel_user_helpers,

View file

@ -1,13 +1,49 @@
/// Hook into .init_array to enable LSE atomic operations at startup, if
/// supported.
#[cfg(all(target_arch = "aarch64", target_os = "linux", not(feature = "compiler-builtins-c")))]
/// Enable LSE atomic operations at startup, if supported.
///
/// Linker sections are based on what [`ctor`] does, with priorities to run slightly before user
/// code:
///
/// - Apple uses the section `__mod_init_func`; `mod_init_funcs` is needed to set
/// `S_MOD_INIT_FUNC_POINTERS`. There doesn't seem to be a way to indicate priorities.
/// - Windows uses `.CRT$XCT`, which is run before user constructors (these should use `.CRT$XCU`).
/// - ELF uses `.init_array` with a priority of 90, which runs before our `ARGV_INIT_ARRAY`
/// initializer (priority 99). Both are within the 0-100 implementation-reserved range, per docs
/// for the [`prio-ctor-dtor`] warning, and this matches compiler-rt's `CONSTRUCTOR_PRIORITY`.
///
/// To save startup time, the initializer is only run if outline atomic routines from
/// compiler-builtins may be used. If LSE is known to be available then the calls are never
/// emitted, and if we build the C intrinsics then it has its own initializer using the symbol
/// `__aarch64_have_lse_atomics`.
///
/// Initialization is done in a global constructor so we get the same behavior regardless of
/// whether Rust's `init` is used, or if we are in a `dylib` or `no_main` situation (as opposed
/// to doing it as part of pre-main startup). This also matches C implementations.
///
/// Ideally `core` would have something similar, but detecting the CPU features requires the
/// auxiliary vector from the OS. We do the initialization in `std` rather than as part of
/// `compiler-builtins` because a builtins->std dependency isn't possible, and inlining parts of
/// `std-detect` would be much messier.
///
/// [`ctor`]: https://github.com/mmastrac/rust-ctor/blob/63382b833ddcbfb8b064f4e86bfa1ed4026ff356/shared/src/macros/mod.rs#L522-L534
/// [`prio-ctor-dtor`]: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html
#[cfg(all(
target_arch = "aarch64",
target_feature = "outline-atomics",
not(target_feature = "lse"),
not(feature = "compiler-builtins-c"),
))]
#[used]
#[unsafe(link_section = ".init_array.90")]
#[cfg_attr(target_vendor = "apple", unsafe(link_section = "__DATA,__mod_init_func,mod_init_funcs"))]
#[cfg_attr(target_os = "windows", unsafe(link_section = ".CRT$XCT"))]
#[cfg_attr(
not(any(target_vendor = "apple", target_os = "windows")),
unsafe(link_section = ".init_array.90")
)]
static RUST_LSE_INIT: extern "C" fn() = {
extern "C" fn init_lse() {
use crate::arch;
// This is provided by compiler-builtins::aarch64_linux.
// This is provided by compiler-builtins::aarch64_outline_atomics.
unsafe extern "C" {
fn __rust_enable_lse();
}