Added runtime detection

Expanded the cache size to 93 (we will need this in the near future)
Fixed detection of VAES, GFNI and VPCLMULQDQ
Could not test with `cupid` because it does not support these features yet
This commit is contained in:
sayantn 2024-06-18 12:29:04 +05:30 committed by Amanieu d'Antras
parent 2fd58a7ac7
commit 1f779b7b40
4 changed files with 66 additions and 20 deletions

View file

@ -76,6 +76,11 @@ features! {
/// * `"avx512bf16"`
/// * `"avx512vp2intersect"`
/// * `"avx512fp16"`
/// * `"avxvnni"`
/// * `"avxifma"`
/// * `"avxneconvert"`
/// * `"avxvnniint8"`
/// * `"avxvnniint16"`
/// * `"f16c"`
/// * `"fma"`
/// * `"bmi1"`
@ -172,6 +177,16 @@ features! {
/// AVX-512 P2INTERSECT
@FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512fp16: "avx512fp16";
/// AVX-512 FP16 (FLOAT16 instructions)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxifma: "avxifma";
/// AVX-IFMA (Integer Fused Multiply Add)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxneconvert: "avxneconvert";
/// AVX-NE-CONVERT (Exceptionless Convert)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnni: "avxvnni";
/// AVX-VNNI (Vector Neural Network Instructions)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnniint16: "avxvnniint16";
/// AVX-VNNI_INT16 (VNNI with 16-bit Integers)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnniint8: "avxvnniint8";
/// AVX-VNNI_INT8 (VNNI with 8-bit Integers)
@FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] f16c: "f16c";
/// F16C (Conversions between IEEE-754 `binary16` and `binary32` formats)
@FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] fma: "fma";

View file

@ -9,30 +9,30 @@ use core::sync::atomic::AtomicUsize;
/// Sets the `bit` of `x`.
#[inline]
const fn set_bit(x: u64, bit: u32) -> u64 {
const fn set_bit(x: u128, bit: u32) -> u128 {
x | 1 << bit
}
/// Tests the `bit` of `x`.
#[inline]
const fn test_bit(x: u64, bit: u32) -> bool {
const fn test_bit(x: u128, bit: u32) -> bool {
x & (1 << bit) != 0
}
/// Unsets the `bit` of `x`.
#[inline]
const fn unset_bit(x: u64, bit: u32) -> u64 {
const fn unset_bit(x: u128, bit: u32) -> u128 {
x & !(1 << bit)
}
/// Maximum number of features that can be cached.
const CACHE_CAPACITY: u32 = 62;
const CACHE_CAPACITY: u32 = 93;
/// This type is used to initialize the cache
// The derived `Default` implementation will initialize the field to zero,
// which is what we want.
#[derive(Copy, Clone, Default)]
pub(crate) struct Initializer(u64);
pub(crate) struct Initializer(u128);
// NOTE: the `debug_assert!` catches any attempt to add more features than
// fit in our cache.
@ -71,10 +71,15 @@ impl Initializer {
}
/// This global variable is a cache of the features supported by the CPU.
// Note: on x64, we only use the first slot
static CACHE: [Cache; 2] = [Cache::uninitialized(), Cache::uninitialized()];
// Note: the third slot is only used in x86
// Another Slot can be added if needed without any change to `Initializer`
static CACHE: [Cache; 3] = [
Cache::uninitialized(),
Cache::uninitialized(),
Cache::uninitialized(),
];
/// Feature cache with capacity for `size_of::<usize::MAX>() * 8 - 1` features.
/// Feature cache with capacity for `size_of::<usize>() * 8 - 1` features.
///
/// Note: 0 is used to represent an uninitialized cache, and (at least) the most
/// significant bit is set on any cache which has been initialized.
@ -102,7 +107,7 @@ impl Cache {
if cached == 0 {
None
} else {
Some(test_bit(cached as u64, bit))
Some(test_bit(cached as u128, bit))
}
}
@ -173,6 +178,7 @@ cfg_if::cfg_if! {
fn do_initialize(value: Initializer) {
// Publish the detected feature bits held in `value.0` to the global `CACHE`.
// The bit set is split into three consecutive chunks of `Cache::CAPACITY`
// bits; each chunk is truncated to `usize`, masked with `Cache::MASK`, and
// handed to its own slot.
//
// Slot 0: bits `0..Cache::CAPACITY`.
CACHE[0].initialize((value.0) as usize & Cache::MASK);
// Slot 1: bits `Cache::CAPACITY..2 * Cache::CAPACITY`.
CACHE[1].initialize((value.0 >> Cache::CAPACITY) as usize & Cache::MASK);
// Slot 2: bits starting at `2 * Cache::CAPACITY`. `*` binds tighter than
// `>>`, so the shift amount here is `2 * Cache::CAPACITY`.
CACHE[2].initialize((value.0 >> 2 * Cache::CAPACITY) as usize & Cache::MASK);
}
// We only have to detect features once, and it's fairly costly, so hint to LLVM
@ -205,8 +211,10 @@ fn detect_and_initialize() -> Initializer {
pub(crate) fn test(bit: u32) -> bool {
let (relative_bit, idx) = if bit < Cache::CAPACITY {
(bit, 0)
} else {
} else if bit < 2 * Cache::CAPACITY {
(bit - Cache::CAPACITY, 1)
} else {
(bit - 2 * Cache::CAPACITY, 2)
};
CACHE[idx]
.test(relative_bit)

View file

@ -74,13 +74,17 @@ pub(crate) fn detect_features() -> cache::Initializer {
extended_features_ecx,
extended_features_edx,
extended_features_eax_leaf_1,
extended_features_edx_leaf_1,
) = if max_basic_leaf >= 7 {
let CpuidResult { ebx, ecx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
let CpuidResult { eax: eax_1, .. } =
unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
(ebx, ecx, edx, eax_1)
let CpuidResult {
eax: eax_1,
edx: edx_1,
..
} = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
(ebx, ecx, edx, eax_1, edx_1)
} else {
(0, 0, 0, 0) // CPUID does not support "Extended Features"
(0, 0, 0, 0, 0) // CPUID does not support "Extended Features"
};
// EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
@ -129,6 +133,10 @@ pub(crate) fn detect_features() -> cache::Initializer {
enable(proc_info_edx, 26, Feature::sse2);
enable(extended_features_ebx, 29, Feature::sha);
enable(extended_features_ecx, 8, Feature::gfni);
enable(extended_features_ecx, 9, Feature::vaes);
enable(extended_features_ecx, 10, Feature::vpclmulqdq);
enable(extended_features_ebx, 3, Feature::bmi1);
enable(extended_features_ebx, 8, Feature::bmi2);
@ -165,8 +173,8 @@ pub(crate) fn detect_features() -> cache::Initializer {
let xcr0 = unsafe { _xgetbv(0) };
// Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
let os_avx_support = xcr0 & 6 == 6;
// Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 224`:
let os_avx512_support = xcr0 & 224 == 224;
// Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
let os_avx512_support = xcr0 & 0xe0 == 0xe0;
// Only if the OS and the CPU support saving/restoring the AVX
// registers we enable `xsave` support:
@ -203,6 +211,13 @@ pub(crate) fn detect_features() -> cache::Initializer {
enable(proc_info_ecx, 28, Feature::avx);
enable(extended_features_ebx, 5, Feature::avx2);
// "Short" versions of AVX512 instructions
enable(extended_features_eax_leaf_1, 4, Feature::avxvnni);
enable(extended_features_eax_leaf_1, 23, Feature::avxifma);
enable(extended_features_edx_leaf_1, 4, Feature::avxvnniint8);
enable(extended_features_edx_leaf_1, 5, Feature::avxneconvert);
enable(extended_features_edx_leaf_1, 10, Feature::avxvnniint16);
// For AVX-512 the OS also needs to support saving/restoring
// the extended state, only then we enable AVX-512 support:
if os_avx512_support {
@ -216,9 +231,6 @@ pub(crate) fn detect_features() -> cache::Initializer {
enable(extended_features_ebx, 31, Feature::avx512vl);
enable(extended_features_ecx, 1, Feature::avx512vbmi);
enable(extended_features_ecx, 6, Feature::avx512vbmi2);
enable(extended_features_ecx, 8, Feature::gfni);
enable(extended_features_ecx, 9, Feature::vaes);
enable(extended_features_ecx, 10, Feature::vpclmulqdq);
enable(extended_features_ecx, 11, Feature::avx512vnni);
enable(extended_features_ecx, 12, Feature::avx512bitalg);
enable(extended_features_ecx, 14, Feature::avx512vpopcntdq);

View file

@ -1,6 +1,6 @@
#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#![allow(internal_features)]
#![feature(stdarch_internal)]
#![feature(stdarch_internal, avx512_target_feature)]
extern crate cupid;
#[macro_use]
@ -68,6 +68,17 @@ fn dump() {
println!("adx: {:?}", is_x86_feature_detected!("adx"));
println!("rtm: {:?}", is_x86_feature_detected!("rtm"));
println!("movbe: {:?}", is_x86_feature_detected!("movbe"));
println!("avxvnni: {:?}", is_x86_feature_detected!("avxvnni"));
println!("avxvnniint8: {:?}", is_x86_feature_detected!("avxvnniint8"));
println!(
"avxneconvert: {:?}",
is_x86_feature_detected!("avxneconvert")
);
println!("avxifma: {:?}", is_x86_feature_detected!("avxifma"));
println!(
"avxvnniint16: {:?}",
is_x86_feature_detected!("avxvnniint16")
);
}
#[cfg(feature = "std_detect_env_override")]