Rollup merge of #144938 - tgross35:more-outline-atomics, r=davidtwco
Enable `outline-atomics` by default on more AArch64 platforms

The baseline Armv8.0 ISA doesn't have atomics instructions, but in practice most hardware is at least Armv8.1-A (2014), which includes single-instruction atomics as part of the LSE feature. As a performance optimization for these cases, GCC and LLVM have the `-moutline-atomics` flag to turn atomic operations into calls to symbols like `__aarch64_cas1_acq`. These can do runtime feature detection and use the LSE instructions if available, falling back to more portable load-exclusive/store-exclusive loops. Since the recent 3b50253b57 ("compiler-builtins: plumb LSE support for aarch64 on linux") our builtins support this LSE optimization, and since 6936bb975a ("Dynamically enable LSE for aarch64 rust provided intrinsics"), std will set the flag as part of its startup code. The first commit in this PR configures this to work on all platforms built with `outline-atomics`, not just Linux. Thus, enable `outline-atomics` by default on Android, OpenBSD, Windows, and Fuchsia platforms that don't have LSE in the baseline. The feature is already enabled on Linux. Platform-specific details are included in each commit message. The current implementation can still be accessed by setting `-Ctarget-feature=-outline-atomics`. Setting `-Ctarget-feature=+lse` or a relevant CPU will use the single-instruction atomics without the call overhead. https://rust.godbolt.org/z/dsdrzszoe Link: https://learn.arm.com/learning-paths/servers-and-cloud-computing/lse/intro/ Original Clang outline-atomics benchmarks: https://reviews.llvm.org/D91157#2435844 try-job: aarch64-msvc-* try-job: arm-android try-job: dist-android try-job: dist-aarch64-llvm-mingw try-job: dist-aarch64-msvc try-job: dist-various-* try-job: test-various
This commit is contained in:
commit
76370238b0
8 changed files with 52 additions and 16 deletions
|
|
@ -21,7 +21,7 @@ pub(crate) fn target() -> Target {
|
|||
max_atomic_width: Some(128),
|
||||
// As documented in https://developer.android.com/ndk/guides/cpu-features.html
|
||||
// the neon (ASIMD) and FP must exist on all android aarch64 targets.
|
||||
features: "+v8a,+neon".into(),
|
||||
features: "+v8a,+neon,+outline-atomics".into(),
|
||||
// the AAPCS64 expects use of non-leaf frame pointers per
|
||||
// https://github.com/ARM-software/abi-aa/blob/4492d1570eb70c8fd146623e0db65b2d241f12e7/aapcs64/aapcs64.rst#the-frame-pointer
|
||||
// and we tend to encounter interesting bugs in AArch64 unwinding code if we do not
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ use crate::spec::{Arch, Cc, FramePointer, LinkerFlavor, Lld, Target, TargetMetad
|
|||
pub(crate) fn target() -> Target {
|
||||
let mut base = base::windows_gnullvm::opts();
|
||||
base.max_atomic_width = Some(128);
|
||||
base.features = "+v8a,+neon".into();
|
||||
base.features = "+v8a,+neon,+outline-atomics".into();
|
||||
base.linker = Some("aarch64-w64-mingw32-clang".into());
|
||||
base.add_pre_link_args(LinkerFlavor::Gnu(Cc::No, Lld::No), &["-m", "arm64pe"]);
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ use crate::spec::{Arch, FramePointer, Target, TargetMetadata, base};
|
|||
pub(crate) fn target() -> Target {
|
||||
let mut base = base::windows_msvc::opts();
|
||||
base.max_atomic_width = Some(128);
|
||||
base.features = "+v8a,+neon".into();
|
||||
base.features = "+v8a,+neon,+outline-atomics".into();
|
||||
|
||||
// Microsoft recommends enabling frame pointers on Arm64 Windows.
|
||||
// From https://learn.microsoft.com/en-us/cpp/build/arm64-windows-abi-conventions?view=msvc-170#integer-registers
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ use crate::spec::{
|
|||
pub(crate) fn target() -> Target {
|
||||
let mut base = base::fuchsia::opts();
|
||||
base.cpu = "generic".into();
|
||||
base.features = "+v8a,+crc,+aes,+sha2,+neon".into();
|
||||
base.features = "+v8a,+crc,+aes,+sha2,+neon,+outline-atomics".into();
|
||||
base.max_atomic_width = Some(128);
|
||||
base.stack_probes = StackProbeType::Inline;
|
||||
base.supported_sanitizers = SanitizerSet::ADDRESS
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ pub(crate) fn target() -> Target {
|
|||
data_layout: "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32".into(),
|
||||
arch: Arch::AArch64,
|
||||
options: TargetOptions {
|
||||
features: "+v8a".into(),
|
||||
features: "+v8a,+outline-atomics".into(),
|
||||
max_atomic_width: Some(128),
|
||||
stack_probes: StackProbeType::Inline,
|
||||
..base::openbsd::opts()
|
||||
|
|
|
|||
|
|
@ -196,7 +196,7 @@ macro_rules! compare_and_swap {
|
|||
"cbnz w17, 0b",
|
||||
"1:",
|
||||
"ret",
|
||||
have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
|
||||
have_lse = sym crate::aarch64_outline_atomics::HAVE_LSE_ATOMICS,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -228,7 +228,7 @@ macro_rules! compare_and_swap_i128 {
|
|||
"cbnz w15, 0b",
|
||||
"1:",
|
||||
"ret",
|
||||
have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
|
||||
have_lse = sym crate::aarch64_outline_atomics::HAVE_LSE_ATOMICS,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -256,7 +256,7 @@ macro_rules! swap {
|
|||
concat!(stxr!($ordering, $bytes), " w17, ", reg!($bytes, 16), ", [x1]"),
|
||||
"cbnz w17, 0b",
|
||||
"ret",
|
||||
have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
|
||||
have_lse = sym crate::aarch64_outline_atomics::HAVE_LSE_ATOMICS,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -286,7 +286,7 @@ macro_rules! fetch_op {
|
|||
concat!(stxr!($ordering, $bytes), " w15, ", reg!($bytes, 17), ", [x1]"),
|
||||
"cbnz w15, 0b",
|
||||
"ret",
|
||||
have_lse = sym crate::aarch64_linux::HAVE_LSE_ATOMICS,
|
||||
have_lse = sym crate::aarch64_outline_atomics::HAVE_LSE_ATOMICS,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -55,8 +55,8 @@ pub mod arm;
|
|||
#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
|
||||
pub mod aarch64;
|
||||
|
||||
#[cfg(all(target_arch = "aarch64", target_os = "linux"))]
|
||||
pub mod aarch64_linux;
|
||||
#[cfg(all(target_arch = "aarch64", target_feature = "outline-atomics"))]
|
||||
pub mod aarch64_outline_atomics;
|
||||
|
||||
#[cfg(all(
|
||||
kernel_user_helpers,
|
||||
|
|
|
|||
|
|
@ -1,13 +1,49 @@
|
|||
/// Hook into .init_array to enable LSE atomic operations at startup, if
|
||||
/// supported.
|
||||
#[cfg(all(target_arch = "aarch64", target_os = "linux", not(feature = "compiler-builtins-c")))]
|
||||
/// Enable LSE atomic operations at startup, if supported.
|
||||
///
|
||||
/// Linker sections are based on what [`ctor`] does, with priorities to run slightly before user
|
||||
/// code:
|
||||
///
|
||||
/// - Apple uses the section `__mod_init_func`, `mod_init_funcs` is needed to set
|
||||
/// `S_MOD_INIT_FUNC_POINTERS`. There doesn't seem to be a way to indicate priorities.
|
||||
/// - Windows uses `.CRT$XCT`, which is run before user constructors (these should use `.CRT$XCU`).
|
||||
/// - ELF uses `.init_array` with a priority of 90, which runs before our `ARGV_INIT_ARRAY`
|
||||
/// initializer (priority 99). Both are within the 0-100 implementation-reserved range, per docs
|
||||
/// for the [`prio-ctor-dtor`] warning, and this matches compiler-rt's `CONSTRUCTOR_PRIORITY`.
|
||||
///
|
||||
/// To save startup time, the initializer is only run if outline atomic routines from
|
||||
/// compiler-builtins may be used. If LSE is known to be available then the calls are never
|
||||
/// emitted, and if we build the C intrinsics then it has its own initializer using the symbol
|
||||
/// `__aarch64_have_lse_atomics`.
|
||||
///
|
||||
/// Initialization is done in a global constructor so we get the same behavior regardless of
|
||||
/// whether Rust's `init` is used, or if we are in a `dylib` or `no_main` situation (as opposed
|
||||
/// to doing it as part of pre-main startup). This also matches C implementations.
|
||||
///
|
||||
/// Ideally `core` would have something similar, but detecting the CPU features requires the
|
||||
/// auxiliary vector from the OS. We do the initialization in `std` rather than as part of
|
||||
/// `compiler-builtins` because a builtins->std dependency isn't possible, and inlining parts of
|
||||
/// `std-detect` would be much messier.
|
||||
///
|
||||
/// [`ctor`]: https://github.com/mmastrac/rust-ctor/blob/63382b833ddcbfb8b064f4e86bfa1ed4026ff356/shared/src/macros/mod.rs#L522-L534
|
||||
/// [`prio-ctor-dtor`]: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html
|
||||
#[cfg(all(
|
||||
target_arch = "aarch64",
|
||||
target_feature = "outline-atomics",
|
||||
not(target_feature = "lse"),
|
||||
not(feature = "compiler-builtins-c"),
|
||||
))]
|
||||
#[used]
|
||||
#[unsafe(link_section = ".init_array.90")]
|
||||
#[cfg_attr(target_vendor = "apple", unsafe(link_section = "__DATA,__mod_init_func,mod_init_funcs"))]
|
||||
#[cfg_attr(target_os = "windows", unsafe(link_section = ".CRT$XCT"))]
|
||||
#[cfg_attr(
|
||||
not(any(target_vendor = "apple", target_os = "windows")),
|
||||
unsafe(link_section = ".init_array.90")
|
||||
)]
|
||||
static RUST_LSE_INIT: extern "C" fn() = {
|
||||
extern "C" fn init_lse() {
|
||||
use crate::arch;
|
||||
|
||||
// This is provided by compiler-builtins::aarch64_linux.
|
||||
// This is provided by compiler-builtins::aarch64_outline_atomics.
|
||||
unsafe extern "C" {
|
||||
fn __rust_enable_lse();
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue