Added runtime detection

Expanded the cache size to 93 (we will need this in the near future). Fixed detection of VAES, GFNI and VPCLMULQDQ. Could not test with `cupid` because it does not support these features yet.
2024-06-18 12:29:04 +05:30 · 2024-06-18 12:29:04 +05:30 · 1f779b7b40
commit 1f779b7b40
parent 2fd58a7ac7
4 changed files with 66 additions and 20 deletions
--- a/library/stdarch/crates/std_detect/src/detect/arch/x86.rs
+++ b/library/stdarch/crates/std_detect/src/detect/arch/x86.rs
@ -76,6 +76,11 @@ features! {
    /// * `"avx512bf16"`
    /// * `"avx512vp2intersect"`
    /// * `"avx512fp16"`
+    /// * `"avxvnni"`
+    /// * `"avxifma"`
+    /// * `"avxneconvert"`
+    /// * `"avxvnniint8"`
+    /// * `"avxvnniint16"`
    /// * `"f16c"`
    /// * `"fma"`
    /// * `"bmi1"`
@ -172,6 +177,16 @@ features! {
    /// AVX-512 P2INTERSECT
    @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512fp16: "avx512fp16";
    /// AVX-512 FP16 (FLOAT16 instructions)
+    @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxifma: "avxifma";
+    /// AVX-IFMA (Integer Fused Multiply Add)
+    @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxneconvert: "avxneconvert";
+    /// AVX-NE-CONVERT (Exceptionless Convert)
+    @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnni: "avxvnni";
+    /// AVX-VNNI (Vector Neural Network Instructions)
+    @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnniint16: "avxvnniint16";
+    /// AVX-VNNI_INT16 (VNNI with 16-bit integers)
+    @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnniint8: "avxvnniint8";
+    /// AVX-VNNI_INT8 (VNNI with 8-bit integers)
    @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] f16c: "f16c";
    /// F16C (Conversions between IEEE-754 `binary16` and `binary32` formats)
    @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] fma: "fma";
--- a/library/stdarch/crates/std_detect/src/detect/cache.rs
+++ b/library/stdarch/crates/std_detect/src/detect/cache.rs
@ -9,30 +9,30 @@ use core::sync::atomic::AtomicUsize;

 /// Sets the `bit` of `x`.
 #[inline]
-const fn set_bit(x: u64, bit: u32) -> u64 {
+const fn set_bit(x: u128, bit: u32) -> u128 {
    x | 1 << bit
 }

 /// Tests the `bit` of `x`.
 #[inline]
-const fn test_bit(x: u64, bit: u32) -> bool {
+const fn test_bit(x: u128, bit: u32) -> bool {
    x & (1 << bit) != 0
 }

 /// Unsets the `bit` of `x`.
 #[inline]
-const fn unset_bit(x: u64, bit: u32) -> u64 {
+const fn unset_bit(x: u128, bit: u32) -> u128 {
    x & !(1 << bit)
 }

 /// Maximum number of features that can be cached.
-const CACHE_CAPACITY: u32 = 62;
+const CACHE_CAPACITY: u32 = 93;

 /// This type is used to initialize the cache
 // The derived `Default` implementation will initialize the field to zero,
 // which is what we want.
 #[derive(Copy, Clone, Default)]
-pub(crate) struct Initializer(u64);
+pub(crate) struct Initializer(u128);

 // NOTE: the `debug_assert!` catches any attempt to add more features than
 // fit in our cache.
@ -71,10 +71,15 @@ impl Initializer {
 }

 /// This global variable is a cache of the features supported by the CPU.
-// Note: on x64, we only use the first slot
-static CACHE: [Cache; 2] = [Cache::uninitialized(), Cache::uninitialized()];
+// Note: the third slot is only used in x86
+// Another Slot can be added if needed without any change to `Initializer`
+static CACHE: [Cache; 3] = [
+    Cache::uninitialized(),
+    Cache::uninitialized(),
+    Cache::uninitialized(),
+];

-/// Feature cache with capacity for `size_of::<usize::MAX>() * 8 - 1` features.
+/// Feature cache with capacity for `size_of::<usize>() * 8 - 1` features.
 ///
 /// Note: 0 is used to represent an uninitialized cache, and (at least) the most
 /// significant bit is set on any cache which has been initialized.
@ -102,7 +107,7 @@ impl Cache {
        if cached == 0 {
            None
        } else {
-            Some(test_bit(cached as u64, bit))
+            Some(test_bit(cached as u128, bit))
        }
    }

@ -173,6 +178,7 @@ cfg_if::cfg_if! {
 fn do_initialize(value: Initializer) {
    CACHE[0].initialize((value.0) as usize & Cache::MASK);
    CACHE[1].initialize((value.0 >> Cache::CAPACITY) as usize & Cache::MASK);
+    CACHE[2].initialize((value.0 >> 2 * Cache::CAPACITY) as usize & Cache::MASK);
 }

 // We only have to detect features once, and it's fairly costly, so hint to LLVM
@ -205,8 +211,10 @@ fn detect_and_initialize() -> Initializer {
 pub(crate) fn test(bit: u32) -> bool {
    let (relative_bit, idx) = if bit < Cache::CAPACITY {
        (bit, 0)
-    } else {
+    } else if bit < 2 * Cache::CAPACITY {
        (bit - Cache::CAPACITY, 1)
+    } else {
+        (bit - 2 * Cache::CAPACITY, 2)
    };
    CACHE[idx]
        .test(relative_bit)
--- a/library/stdarch/crates/std_detect/src/detect/os/x86.rs
+++ b/library/stdarch/crates/std_detect/src/detect/os/x86.rs
@ -74,13 +74,17 @@ pub(crate) fn detect_features() -> cache::Initializer {
        extended_features_ecx,
        extended_features_edx,
        extended_features_eax_leaf_1,
+        extended_features_edx_leaf_1,
    ) = if max_basic_leaf >= 7 {
        let CpuidResult { ebx, ecx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
-        let CpuidResult { eax: eax_1, .. } =
-            unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
-        (ebx, ecx, edx, eax_1)
+        let CpuidResult {
+            eax: eax_1,
+            edx: edx_1,
+            ..
+        } = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
+        (ebx, ecx, edx, eax_1, edx_1)
    } else {
-        (0, 0, 0, 0) // CPUID does not support "Extended Features"
+        (0, 0, 0, 0, 0) // CPUID does not support "Extended Features"
    };

    // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
@ -129,6 +133,10 @@ pub(crate) fn detect_features() -> cache::Initializer {
        enable(proc_info_edx, 26, Feature::sse2);
        enable(extended_features_ebx, 29, Feature::sha);

+        enable(extended_features_ecx, 8, Feature::gfni);
+        enable(extended_features_ecx, 9, Feature::vaes);
+        enable(extended_features_ecx, 10, Feature::vpclmulqdq);
+
        enable(extended_features_ebx, 3, Feature::bmi1);
        enable(extended_features_ebx, 8, Feature::bmi2);

@ -165,8 +173,8 @@ pub(crate) fn detect_features() -> cache::Initializer {
                let xcr0 = unsafe { _xgetbv(0) };
                // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
                let os_avx_support = xcr0 & 6 == 6;
-                // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 224`:
-                let os_avx512_support = xcr0 & 224 == 224;
+                // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
+                let os_avx512_support = xcr0 & 0xe0 == 0xe0;

                // Only if the OS and the CPU support saving/restoring the AVX
                // registers we enable `xsave` support:
@ -203,6 +211,13 @@ pub(crate) fn detect_features() -> cache::Initializer {
                    enable(proc_info_ecx, 28, Feature::avx);
                    enable(extended_features_ebx, 5, Feature::avx2);

+                    // "Short" versions of AVX512 instructions
+                    enable(extended_features_eax_leaf_1, 4, Feature::avxvnni);
+                    enable(extended_features_eax_leaf_1, 23, Feature::avxifma);
+                    enable(extended_features_edx_leaf_1, 4, Feature::avxvnniint8);
+                    enable(extended_features_edx_leaf_1, 5, Feature::avxneconvert);
+                    enable(extended_features_edx_leaf_1, 10, Feature::avxvnniint16);
+
                    // For AVX-512 the OS also needs to support saving/restoring
                    // the extended state, only then we enable AVX-512 support:
                    if os_avx512_support {
@ -216,9 +231,6 @@ pub(crate) fn detect_features() -> cache::Initializer {
                        enable(extended_features_ebx, 31, Feature::avx512vl);
                        enable(extended_features_ecx, 1, Feature::avx512vbmi);
                        enable(extended_features_ecx, 6, Feature::avx512vbmi2);
-                        enable(extended_features_ecx, 8, Feature::gfni);
-                        enable(extended_features_ecx, 9, Feature::vaes);
-                        enable(extended_features_ecx, 10, Feature::vpclmulqdq);
                        enable(extended_features_ecx, 11, Feature::avx512vnni);
                        enable(extended_features_ecx, 12, Feature::avx512bitalg);
                        enable(extended_features_ecx, 14, Feature::avx512vpopcntdq);
--- a/library/stdarch/crates/std_detect/tests/x86-specific.rs
+++ b/library/stdarch/crates/std_detect/tests/x86-specific.rs
@ -1,6 +1,6 @@
 #![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 #![allow(internal_features)]
-#![feature(stdarch_internal)]
+#![feature(stdarch_internal, avx512_target_feature)]

 extern crate cupid;
 #[macro_use]
@ -68,6 +68,17 @@ fn dump() {
    println!("adx: {:?}", is_x86_feature_detected!("adx"));
    println!("rtm: {:?}", is_x86_feature_detected!("rtm"));
    println!("movbe: {:?}", is_x86_feature_detected!("movbe"));
+    println!("avxvnni: {:?}", is_x86_feature_detected!("avxvnni"));
+    println!("avxvnniint8: {:?}", is_x86_feature_detected!("avxvnniint8"));
+    println!(
+        "avxneconvert: {:?}",
+        is_x86_feature_detected!("avxneconvert")
+    );
+    println!("avxifma: {:?}", is_x86_feature_detected!("avxifma"));
+    println!(
+        "avxvnniint16: {:?}",
+        is_x86_feature_detected!("avxvnniint16")
+    );
 }

 #[cfg(feature = "std_detect_env_override")]