Added runtime detection
Expanded the cache size to 93 (we will need this in the near future). Fixed detection of VAES, GFNI and VPCLMULQDQ. Could not test with `cupid` because it does not support these features yet.
This commit is contained in:
parent
2fd58a7ac7
commit
1f779b7b40
4 changed files with 66 additions and 20 deletions
|
|
@ -76,6 +76,11 @@ features! {
|
|||
/// * `"avx512bf16"`
|
||||
/// * `"avx512vp2intersect"`
|
||||
/// * `"avx512fp16"`
|
||||
/// * `"avxvnni"`
|
||||
/// * `"avxifma"`
|
||||
/// * `"avxneconvert"`
|
||||
/// * `"avxvnniint8"`
|
||||
/// * `"avxvnniint16"`
|
||||
/// * `"f16c"`
|
||||
/// * `"fma"`
|
||||
/// * `"bmi1"`
|
||||
|
|
@ -172,6 +177,16 @@ features! {
|
|||
/// AVX-512 P2INTERSECT
|
||||
@FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512fp16: "avx512fp16";
|
||||
/// AVX-512 FP16 (FLOAT16 instructions)
|
||||
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxifma: "avxifma";
|
||||
/// AVX-IFMA (Integer Fused Multiply Add)
|
||||
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxneconvert: "avxneconvert";
|
||||
/// AVX-NE-CONVERT (Exceptionless Convert)
|
||||
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnni: "avxvnni";
|
||||
/// AVX-VNNI (Vector Neural Network Instructions)
|
||||
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnniint16: "avxvnniint16";
|
||||
/// AVX-VNNI_INT16 (VNNI with 16-bit Integers)
|
||||
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnniint8: "avxvnniint8";
|
||||
/// AVX-VNNI_INT8 (VNNI with 8-bit integers)
|
||||
@FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] f16c: "f16c";
|
||||
/// F16C (Conversions between IEEE-754 `binary16` and `binary32` formats)
|
||||
@FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] fma: "fma";
|
||||
|
|
|
|||
|
|
@ -9,30 +9,30 @@ use core::sync::atomic::AtomicUsize;
|
|||
|
||||
/// Sets the `bit` of `x`.
|
||||
#[inline]
|
||||
const fn set_bit(x: u64, bit: u32) -> u64 {
|
||||
const fn set_bit(x: u128, bit: u32) -> u128 {
|
||||
x | 1 << bit
|
||||
}
|
||||
|
||||
/// Tests the `bit` of `x`.
|
||||
#[inline]
|
||||
const fn test_bit(x: u64, bit: u32) -> bool {
|
||||
const fn test_bit(x: u128, bit: u32) -> bool {
|
||||
x & (1 << bit) != 0
|
||||
}
|
||||
|
||||
/// Unsets the `bit` of `x`.
|
||||
#[inline]
|
||||
const fn unset_bit(x: u64, bit: u32) -> u64 {
|
||||
const fn unset_bit(x: u128, bit: u32) -> u128 {
|
||||
x & !(1 << bit)
|
||||
}
|
||||
|
||||
/// Maximum number of features that can be cached.
|
||||
const CACHE_CAPACITY: u32 = 62;
|
||||
const CACHE_CAPACITY: u32 = 93;
|
||||
|
||||
/// This type is used to initialize the cache
|
||||
// The derived `Default` implementation will initialize the field to zero,
|
||||
// which is what we want.
|
||||
#[derive(Copy, Clone, Default)]
|
||||
pub(crate) struct Initializer(u64);
|
||||
pub(crate) struct Initializer(u128);
|
||||
|
||||
// NOTE: the `debug_assert!` would catch that we do not add more Features than
|
||||
// the one fitting our cache.
|
||||
|
|
@ -71,10 +71,15 @@ impl Initializer {
|
|||
}
|
||||
|
||||
/// This global variable is a cache of the features supported by the CPU.
|
||||
// Note: on x64, we only use the first slot
|
||||
static CACHE: [Cache; 2] = [Cache::uninitialized(), Cache::uninitialized()];
|
||||
// Note: the third slot is only used in x86
|
||||
// Another Slot can be added if needed without any change to `Initializer`
|
||||
static CACHE: [Cache; 3] = [
|
||||
Cache::uninitialized(),
|
||||
Cache::uninitialized(),
|
||||
Cache::uninitialized(),
|
||||
];
|
||||
|
||||
/// Feature cache with capacity for `size_of::<usize::MAX>() * 8 - 1` features.
|
||||
/// Feature cache with capacity for `size_of::<usize>() * 8 - 1` features.
|
||||
///
|
||||
/// Note: 0 is used to represent an uninitialized cache, and (at least) the most
|
||||
/// significant bit is set on any cache which has been initialized.
|
||||
|
|
@ -102,7 +107,7 @@ impl Cache {
|
|||
if cached == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(test_bit(cached as u64, bit))
|
||||
Some(test_bit(cached as u128, bit))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -173,6 +178,7 @@ cfg_if::cfg_if! {
|
|||
fn do_initialize(value: Initializer) {
|
||||
CACHE[0].initialize((value.0) as usize & Cache::MASK);
|
||||
CACHE[1].initialize((value.0 >> Cache::CAPACITY) as usize & Cache::MASK);
|
||||
CACHE[2].initialize((value.0 >> 2 * Cache::CAPACITY) as usize & Cache::MASK);
|
||||
}
|
||||
|
||||
// We only have to detect features once, and it's fairly costly, so hint to LLVM
|
||||
|
|
@ -205,8 +211,10 @@ fn detect_and_initialize() -> Initializer {
|
|||
pub(crate) fn test(bit: u32) -> bool {
|
||||
let (relative_bit, idx) = if bit < Cache::CAPACITY {
|
||||
(bit, 0)
|
||||
} else {
|
||||
} else if bit < 2 * Cache::CAPACITY {
|
||||
(bit - Cache::CAPACITY, 1)
|
||||
} else {
|
||||
(bit - 2 * Cache::CAPACITY, 2)
|
||||
};
|
||||
CACHE[idx]
|
||||
.test(relative_bit)
|
||||
|
|
|
|||
|
|
@ -74,13 +74,17 @@ pub(crate) fn detect_features() -> cache::Initializer {
|
|||
extended_features_ecx,
|
||||
extended_features_edx,
|
||||
extended_features_eax_leaf_1,
|
||||
extended_features_edx_leaf_1,
|
||||
) = if max_basic_leaf >= 7 {
|
||||
let CpuidResult { ebx, ecx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
|
||||
let CpuidResult { eax: eax_1, .. } =
|
||||
unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
|
||||
(ebx, ecx, edx, eax_1)
|
||||
let CpuidResult {
|
||||
eax: eax_1,
|
||||
edx: edx_1,
|
||||
..
|
||||
} = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
|
||||
(ebx, ecx, edx, eax_1, edx_1)
|
||||
} else {
|
||||
(0, 0, 0, 0) // CPUID does not support "Extended Features"
|
||||
(0, 0, 0, 0, 0) // CPUID does not support "Extended Features"
|
||||
};
|
||||
|
||||
// EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
|
||||
|
|
@ -129,6 +133,10 @@ pub(crate) fn detect_features() -> cache::Initializer {
|
|||
enable(proc_info_edx, 26, Feature::sse2);
|
||||
enable(extended_features_ebx, 29, Feature::sha);
|
||||
|
||||
enable(extended_features_ecx, 8, Feature::gfni);
|
||||
enable(extended_features_ecx, 9, Feature::vaes);
|
||||
enable(extended_features_ecx, 10, Feature::vpclmulqdq);
|
||||
|
||||
enable(extended_features_ebx, 3, Feature::bmi1);
|
||||
enable(extended_features_ebx, 8, Feature::bmi2);
|
||||
|
||||
|
|
@ -165,8 +173,8 @@ pub(crate) fn detect_features() -> cache::Initializer {
|
|||
let xcr0 = unsafe { _xgetbv(0) };
|
||||
// Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
|
||||
let os_avx_support = xcr0 & 6 == 6;
|
||||
// Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 224`:
|
||||
let os_avx512_support = xcr0 & 224 == 224;
|
||||
// Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
|
||||
let os_avx512_support = xcr0 & 0xe0 == 0xe0;
|
||||
|
||||
// Only if the OS and the CPU support saving/restoring the AVX
|
||||
// registers we enable `xsave` support:
|
||||
|
|
@ -203,6 +211,13 @@ pub(crate) fn detect_features() -> cache::Initializer {
|
|||
enable(proc_info_ecx, 28, Feature::avx);
|
||||
enable(extended_features_ebx, 5, Feature::avx2);
|
||||
|
||||
// "Short" versions of AVX512 instructions
|
||||
enable(extended_features_eax_leaf_1, 4, Feature::avxvnni);
|
||||
enable(extended_features_eax_leaf_1, 23, Feature::avxifma);
|
||||
enable(extended_features_edx_leaf_1, 4, Feature::avxvnniint8);
|
||||
enable(extended_features_edx_leaf_1, 5, Feature::avxneconvert);
|
||||
enable(extended_features_edx_leaf_1, 10, Feature::avxvnniint16);
|
||||
|
||||
// For AVX-512 the OS also needs to support saving/restoring
|
||||
// the extended state, only then we enable AVX-512 support:
|
||||
if os_avx512_support {
|
||||
|
|
@ -216,9 +231,6 @@ pub(crate) fn detect_features() -> cache::Initializer {
|
|||
enable(extended_features_ebx, 31, Feature::avx512vl);
|
||||
enable(extended_features_ecx, 1, Feature::avx512vbmi);
|
||||
enable(extended_features_ecx, 6, Feature::avx512vbmi2);
|
||||
enable(extended_features_ecx, 8, Feature::gfni);
|
||||
enable(extended_features_ecx, 9, Feature::vaes);
|
||||
enable(extended_features_ecx, 10, Feature::vpclmulqdq);
|
||||
enable(extended_features_ecx, 11, Feature::avx512vnni);
|
||||
enable(extended_features_ecx, 12, Feature::avx512bitalg);
|
||||
enable(extended_features_ecx, 14, Feature::avx512vpopcntdq);
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
#![allow(internal_features)]
|
||||
#![feature(stdarch_internal)]
|
||||
#![feature(stdarch_internal, avx512_target_feature)]
|
||||
|
||||
extern crate cupid;
|
||||
#[macro_use]
|
||||
|
|
@ -68,6 +68,17 @@ fn dump() {
|
|||
println!("adx: {:?}", is_x86_feature_detected!("adx"));
|
||||
println!("rtm: {:?}", is_x86_feature_detected!("rtm"));
|
||||
println!("movbe: {:?}", is_x86_feature_detected!("movbe"));
|
||||
println!("avxvnni: {:?}", is_x86_feature_detected!("avxvnni"));
|
||||
println!("avxvnniint8: {:?}", is_x86_feature_detected!("avxvnniint8"));
|
||||
println!(
|
||||
"avxneconvert: {:?}",
|
||||
is_x86_feature_detected!("avxneconvert")
|
||||
);
|
||||
println!("avxifma: {:?}", is_x86_feature_detected!("avxifma"));
|
||||
println!(
|
||||
"avxvnniint16: {:?}",
|
||||
is_x86_feature_detected!("avxvnniint16")
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(feature = "std_detect_env_override")]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue