Add an x86_64::cmpxchg16b intrinsic
This intrinsic isn't actually specified by Intel, but it's something gated with CPUID and can otherwise be a useful thing to have when building primitives! There exists an `AtomicU128` type in the standard library but it's only exposed currently (and it's unstable) when a platform fully supports 128-bit atomics. The x86_64 architecture does not support it *unless* the `cmpxchg16b` instruction is available, and it isn't always available! This commit is also a proposal for how we can include support for 128-bit atomics in the standard library on relevant platforms. I'm thinking that we'll expose this one low-level intrinsic in `std::arch::x86_64`, and then if desired a crate on crates.io can build `AtomicU128` from this API. In any case this is all unstable regardless!
This commit is contained in:
parent
d30c29e926
commit
9c4e418fe0
8 changed files with 105 additions and 1 deletions
74
library/stdarch/coresimd/x86_64/cmpxchg16b.rs
Normal file
74
library/stdarch/coresimd/x86_64/cmpxchg16b.rs
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
use sync::atomic::Ordering;
|
||||
|
||||
#[cfg(test)]
|
||||
use stdsimd_test::assert_instr;
|
||||
|
||||
/// Compare and exchange 16 bytes (128 bits) of data atomically.
|
||||
///
|
||||
/// This intrinsic corresponds to the `cmpxchg16b` instruction on x86_64
|
||||
/// processors. It performs an atomic compare-and-swap, updating the `dst`
|
||||
/// memory location to `new` if the current value in memory equals `old`.
|
||||
///
|
||||
/// # Return value
|
||||
///
|
||||
/// This function returns the previous value at the memory location. If it is
|
||||
/// equal to `old` then the memory was updated to `new`.
|
||||
///
|
||||
/// # Memory Orderings
|
||||
///
|
||||
/// This atomic operation has the same memory ordering semantics as
|
||||
/// `AtomicUsize::compare_exchange` does, only operating on 16 bytes of memory
|
||||
/// instead of just a pointer-sized value.
|
||||
///
|
||||
/// For more information on memory orderings here see the `compare_exchange`
|
||||
/// documentation for other `Atomic*` types in the standard library.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// This method is unsafe because it takes a raw pointer and will attempt to
|
||||
/// read and possibly write the memory at the pointer. The pointer must also be
|
||||
/// aligned on a 16-byte boundary.
|
||||
///
|
||||
/// This method also requires the `cmpxchg16b` CPU feature to be available at
|
||||
/// runtime to work correctly. If the CPU running the binary does not actually
|
||||
/// support `cmpxchg16b` and the program enters an execution path that
|
||||
/// eventually would reach this function the behavior is undefined.
|
||||
///
|
||||
/// The `success` ordering must also be stronger or equal to `failure`, or this
|
||||
/// function call is undefined. See the `Atomic*` documentation's
|
||||
/// `compare_exchange` function for more information. When `compare_exchange`
|
||||
/// panics, this is undefined behavior. Currently this function aborts the
|
||||
/// process with an undefined instruction.
|
||||
#[inline]
|
||||
#[cfg_attr(test, assert_instr(cmpxchg16b, success = Ordering::SeqCst, failure = Ordering::SeqCst))]
|
||||
#[target_feature(enable = "cmpxchg16b")]
|
||||
pub unsafe fn cmpxchg16b(
|
||||
dst: *mut u128,
|
||||
old: u128,
|
||||
new: u128,
|
||||
success: Ordering,
|
||||
failure: Ordering,
|
||||
) -> u128 {
|
||||
use intrinsics;
|
||||
|
||||
use sync::atomic::Ordering::*;
|
||||
|
||||
// The 16-byte alignment requirement is only verified when debug assertions
// are enabled; otherwise upholding it is entirely up to the caller.
debug_assert!(dst as usize % 16 == 0);
|
||||
|
||||
// Dispatch on the requested (success, failure) pair. Each arm's result is
// destructured as a `(value, flag)` pair; the flag is discarded because
// only the value is returned to the caller.
let (val, _ok) = match (success, failure) {
|
||||
(Acquire, Acquire) => intrinsics::atomic_cxchg_acq(dst, old, new),
|
||||
(Release, Relaxed) => intrinsics::atomic_cxchg_rel(dst, old, new),
|
||||
(AcqRel, Acquire) => intrinsics::atomic_cxchg_acqrel(dst, old, new),
|
||||
(Relaxed, Relaxed) => intrinsics::atomic_cxchg_relaxed(dst, old, new),
|
||||
(SeqCst, SeqCst) => intrinsics::atomic_cxchg(dst, old, new),
|
||||
(Acquire, Relaxed) => intrinsics::atomic_cxchg_acq_failrelaxed(dst, old, new),
|
||||
(AcqRel, Relaxed) => intrinsics::atomic_cxchg_acqrel_failrelaxed(dst, old, new),
|
||||
(SeqCst, Relaxed) => intrinsics::atomic_cxchg_failrelaxed(dst, old, new),
|
||||
(SeqCst, Acquire) => intrinsics::atomic_cxchg_failacq(dst, old, new),
|
||||
|
||||
// The above block is all copied from libcore, and this statement is
|
||||
// also copied from libcore except that it's a panic in libcore and we
|
||||
// have a little bit more of a lightweight panic here.
|
||||
_ => ::coresimd::x86::ud2(),
|
||||
};
|
||||
// The value produced by the selected intrinsic is the function's result.
val
|
||||
}
|
||||
|
|
@ -38,3 +38,6 @@ pub use self::bswap::*;
|
|||
|
||||
mod rdrand;
|
||||
pub use self::rdrand::*;
|
||||
|
||||
mod cmpxchg16b;
|
||||
pub use self::cmpxchg16b::*;
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@
|
|||
sse4a_target_feature,
|
||||
arm_target_feature,
|
||||
aarch64_target_feature,
|
||||
cmpxchg16b_target_feature,
|
||||
avx512_target_feature,
|
||||
mips_target_feature,
|
||||
powerpc_target_feature,
|
||||
|
|
@ -74,6 +75,8 @@
|
|||
test(attr(allow(dead_code, deprecated, unused_variables, unused_mut)))
|
||||
)]
|
||||
|
||||
#[macro_use]
|
||||
#[allow(unused_imports)]
|
||||
extern crate core as _core;
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
|
|
@ -129,6 +132,8 @@ use _core::result;
|
|||
#[allow(unused_imports)]
|
||||
use _core::slice;
|
||||
#[allow(unused_imports)]
|
||||
use _core::sync;
|
||||
#[allow(unused_imports)]
|
||||
use _core::u128;
|
||||
#[allow(unused_imports)]
|
||||
use _core::u8;
|
||||
|
|
|
|||
|
|
@ -101,7 +101,9 @@ fn parse_objdump(output: &str) -> HashMap<String, Vec<Function>> {
|
|||
.skip_while(|s| {
|
||||
s.len() == expected_len
|
||||
&& usize::from_str_radix(s, 16).is_ok()
|
||||
}).map(|s| s.to_string())
|
||||
})
|
||||
.skip_while(|s| *s == "lock") // skip x86-specific prefix
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<String>>();
|
||||
instructions.push(Instruction { parts });
|
||||
}
|
||||
|
|
@ -198,6 +200,7 @@ fn parse_dumpbin(output: &str) -> HashMap<String, Vec<Function>> {
|
|||
.skip_while(|s| {
|
||||
s.len() == 2 && usize::from_str_radix(s, 16).is_ok()
|
||||
}).map(|s| s.to_string())
|
||||
.skip_while(|s| *s == "lock") // skip x86-specific prefix
|
||||
.collect::<Vec<String>>();
|
||||
instructions.push(Instruction { parts });
|
||||
}
|
||||
|
|
|
|||
|
|
@ -124,7 +124,9 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream {
|
|||
"u16" => quote! { &U16 },
|
||||
"u32" => quote! { &U32 },
|
||||
"u64" => quote! { &U64 },
|
||||
"u128" => quote! { &U128 },
|
||||
"u8" => quote! { &U8 },
|
||||
"Ordering" => quote! { &ORDERING },
|
||||
"CpuidResult" => quote! { &CPUID },
|
||||
|
||||
// arm ...
|
||||
|
|
|
|||
|
|
@ -37,7 +37,9 @@ static I8: Type = Type::PrimSigned(8);
|
|||
static U16: Type = Type::PrimUnsigned(16);
|
||||
static U32: Type = Type::PrimUnsigned(32);
|
||||
static U64: Type = Type::PrimUnsigned(64);
|
||||
static U128: Type = Type::PrimUnsigned(128);
|
||||
static U8: Type = Type::PrimUnsigned(8);
|
||||
static ORDERING: Type = Type::Ordering;
|
||||
|
||||
static M64: Type = Type::M64;
|
||||
static M128: Type = Type::M128;
|
||||
|
|
@ -75,6 +77,7 @@ enum Type {
|
|||
Tuple,
|
||||
CpuidResult,
|
||||
Never,
|
||||
Ordering,
|
||||
}
|
||||
|
||||
stdsimd_verify::x86_functions!(static FUNCTIONS);
|
||||
|
|
@ -145,6 +148,8 @@ fn verify_all_signatures() {
|
|||
"__cpuid_count" |
|
||||
"__cpuid" |
|
||||
"__get_cpuid_max" |
|
||||
// Not listed with Intel, but manually verified
|
||||
"cmpxchg16b" |
|
||||
// The UD2 intrinsic is not defined by Intel, but it was agreed on
|
||||
// in the RFC Issue 2512:
|
||||
// https://github.com/rust-lang/rfcs/issues/2512
|
||||
|
|
|
|||
|
|
@ -226,6 +226,10 @@ macro_rules! is_x86_feature_detected {
|
|||
cfg!(target_feature = "xsavec") || $crate::arch::detect::check_for(
|
||||
$crate::arch::detect::Feature::xsavec)
|
||||
};
|
||||
("cmpxchg16b") => {
|
||||
cfg!(target_feature = "cmpxchg16b") || $crate::arch::detect::check_for(
|
||||
$crate::arch::detect::Feature::cmpxchg16b)
|
||||
};
|
||||
($t:tt) => {
|
||||
compile_error!(concat!("unknown target feature: ", $t))
|
||||
};
|
||||
|
|
@ -316,4 +320,6 @@ pub enum Feature {
|
|||
xsaves,
|
||||
/// XSAVEC (Save Processor Extended States Compacted)
|
||||
xsavec,
|
||||
/// CMPXCHG16B, a 16-byte compare-and-swap instruction
|
||||
cmpxchg16b,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -116,6 +116,7 @@ fn detect_features() -> cache::Initializer {
|
|||
|
||||
enable(proc_info_ecx, 0, Feature::sse3);
|
||||
enable(proc_info_ecx, 9, Feature::ssse3);
|
||||
enable(proc_info_ecx, 13, Feature::cmpxchg16b);
|
||||
enable(proc_info_ecx, 19, Feature::sse4_1);
|
||||
enable(proc_info_ecx, 20, Feature::sse4_2);
|
||||
enable(proc_info_ecx, 23, Feature::popcnt);
|
||||
|
|
@ -288,6 +289,7 @@ mod tests {
|
|||
println!("xsaveopt: {:?}", is_x86_feature_detected!("xsaveopt"));
|
||||
println!("xsaves: {:?}", is_x86_feature_detected!("xsaves"));
|
||||
println!("xsavec: {:?}", is_x86_feature_detected!("xsavec"));
|
||||
println!("cmpxchg16b: {:?}", is_x86_feature_detected!("cmpxchg16b"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -344,5 +346,9 @@ mod tests {
|
|||
is_x86_feature_detected!("xsaves"),
|
||||
information.xsaves_xrstors_and_ia32_xss()
|
||||
);
|
||||
assert_eq!(
|
||||
is_x86_feature_detected!("cmpxchg16b"),
|
||||
information.cmpxchg16b(),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue