Stabilize x86/x86_64 intrinsics (#414)

This commit stabilizes all intrinsics in the `x86` and `x86_64` modules, namely allowing stabilization of the `arch::x86` and `arch::x86_64` module in libstd. Stabilizations here were applied in an automated fashion using [this script][scr], and notably everything related to `__m64` was omitted from this round of stabilization [scr]: https://gist.github.com/alexcrichton/5b456d495d6fe1df46a158754565c7a5
2018-04-13 09:32:22 -05:00 · 2018-04-13 09:32:22 -05:00 · f650b93003
commit f650b93003
parent b89963711d
42 changed files with 2897 additions and 15 deletions
--- a/library/stdarch/coresimd/arm/mod.rs
+++ b/library/stdarch/coresimd/arm/mod.rs
@ -20,8 +20,10 @@ pub use self::v7::*;
 // NEON is supported on AArch64, and on ARM when built with the v7 and neon
 // features. Building ARM without neon produces incorrect codegen.
 #[cfg(any(target_arch = "aarch64",
-          all(target_feature = "v7", target_feature = "neon")))]
+          all(target_feature = "v7", target_feature = "neon"),
+          dox))]
 mod neon;
 #[cfg(any(target_arch = "aarch64",
-          all(target_feature = "v7", target_feature = "neon")))]
+          all(target_feature = "v7", target_feature = "neon"),
+          dox))]
 pub use self::neon::*;
--- a/library/stdarch/coresimd/mod.rs
+++ b/library/stdarch/coresimd/mod.rs
@ -41,14 +41,16 @@ pub mod simd {
 /// [`aarch64`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/index.html
 /// [`mips`]: https://rust-lang-nursery.github.io/stdsimd/mips/stdsimd/arch/index.html
 /// [`mips64`]: https://rust-lang-nursery.github.io/stdsimd/mips64/stdsimd/arch/index.html
-#[unstable(feature = "stdsimd", issue = "0")]
+#[stable(feature = "simd_arch", since = "1.27.0")]
 pub mod arch {
    /// Platform-specific intrinsics for the `x86` platform.
    ///
    /// See the [module documentation](../index.html) for more details.
    #[cfg(any(target_arch = "x86", dox))]
    #[doc(cfg(target_arch = "x86"))]
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub mod x86 {
+        #[stable(feature = "simd_x86", since = "1.27.0")]
        pub use coresimd::x86::*;
    }

@ -57,8 +59,11 @@ pub mod arch {
    /// See the [module documentation](../index.html) for more details.
    #[cfg(any(target_arch = "x86_64", dox))]
    #[doc(cfg(target_arch = "x86_64"))]
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub mod x86_64 {
+        #[stable(feature = "simd_x86", since = "1.27.0")]
        pub use coresimd::x86::*;
+        #[stable(feature = "simd_x86", since = "1.27.0")]
        pub use coresimd::x86_64::*;
    }

@ -67,6 +72,7 @@ pub mod arch {
    /// See the [module documentation](../index.html) for more details.
    #[cfg(any(target_arch = "arm", dox))]
    #[doc(cfg(target_arch = "arm"))]
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub mod arm {
        pub use coresimd::arm::*;
    }
@ -76,6 +82,7 @@ pub mod arch {
    /// See the [module documentation](../index.html) for more details.
    #[cfg(any(target_arch = "aarch64", dox))]
    #[doc(cfg(target_arch = "aarch64"))]
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub mod aarch64 {
        pub use coresimd::aarch64::*;
        pub use coresimd::arm::*;
@ -85,6 +92,7 @@ pub mod arch {
    ///
    /// See the [module documentation](../index.html) for more details.
    #[cfg(target_arch = "wasm32")]
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub mod wasm32 {
        pub use coresimd::wasm32::*;
    }
@ -94,6 +102,7 @@ pub mod arch {
    /// See the [module documentation](../index.html) for more details.
    #[cfg(any(target_arch = "mips", dox))]
    #[doc(cfg(target_arch = "mips"))]
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub mod mips {
        pub use coresimd::mips::*;
    }
@ -103,6 +112,7 @@ pub mod arch {
    /// See the [module documentation](../index.html) for more details.
    #[cfg(any(target_arch = "mips64", dox))]
    #[doc(cfg(target_arch = "mips64"))]
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub mod mips64 {
        pub use coresimd::mips::*;
    }
--- a/library/stdarch/coresimd/x86/abm.rs
+++ b/library/stdarch/coresimd/x86/abm.rs
@ -23,17 +23,23 @@ use stdsimd_test::assert_instr;
 /// Counts the leading most significant zero bits.
 ///
 /// When the operand is zero, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_lzcnt_u32)
 #[inline]
 #[target_feature(enable = "lzcnt")]
 #[cfg_attr(test, assert_instr(lzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _lzcnt_u32(x: u32) -> u32 {
    x.leading_zeros()
 }

 /// Counts the bits that are set.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_popcnt32)
 #[inline]
 #[target_feature(enable = "popcnt")]
 #[cfg_attr(test, assert_instr(popcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _popcnt32(x: i32) -> i32 {
    x.count_ones() as i32
 }
--- a/library/stdarch/coresimd/x86/aes.rs
+++ b/library/stdarch/coresimd/x86/aes.rs
@ -29,41 +29,56 @@ extern "C" {
 }

 /// Perform one round of an AES decryption flow on data (state) in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesdec_si128)
 #[inline]
 #[target_feature(enable = "aes")]
 #[cfg_attr(test, assert_instr(aesdec))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_aesdec_si128(a: __m128i, round_key: __m128i) -> __m128i {
    aesdec(a, round_key)
 }

 /// Perform the last round of an AES decryption flow on data (state) in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesdeclast_si128)
 #[inline]
 #[target_feature(enable = "aes")]
 #[cfg_attr(test, assert_instr(aesdeclast))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_aesdeclast_si128(a: __m128i, round_key: __m128i) -> __m128i {
    aesdeclast(a, round_key)
 }

 /// Perform one round of an AES encryption flow on data (state) in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenc_si128)
 #[inline]
 #[target_feature(enable = "aes")]
 #[cfg_attr(test, assert_instr(aesenc))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_aesenc_si128(a: __m128i, round_key: __m128i) -> __m128i {
    aesenc(a, round_key)
 }

 /// Perform the last round of an AES encryption flow on data (state) in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128)
 #[inline]
 #[target_feature(enable = "aes")]
 #[cfg_attr(test, assert_instr(aesenclast))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_aesenclast_si128(a: __m128i, round_key: __m128i) -> __m128i {
    aesenclast(a, round_key)
 }

 /// Perform the `InvMixColumns` transformation on `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesimc_si128)
 #[inline]
 #[target_feature(enable = "aes")]
 #[cfg_attr(test, assert_instr(aesimc))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_aesimc_si128(a: __m128i) -> __m128i {
    aesimc(a)
 }
@ -73,10 +88,13 @@ pub unsafe fn _mm_aesimc_si128(a: __m128i) -> __m128i {
 /// Assist in expanding the AES cipher key by computing steps towards
 /// generating a round key for encryption cipher using data from `a` and an
 /// 8-bit round constant `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aeskeygenassist_si128)
 #[inline]
 #[target_feature(enable = "aes")]
 #[cfg_attr(test, assert_instr(aeskeygenassist, imm8 = 0))]
 #[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_aeskeygenassist_si128(a: __m128i, imm8: i32) -> __m128i {
    macro_rules! call {
        ($imm8:expr) => {
--- a/library/stdarch/coresimd/x86/avx.rs
+++ b/library/stdarch/coresimd/x86/avx.rs
--- a/library/stdarch/coresimd/x86/avx2.rs
+++ b/library/stdarch/coresimd/x86/avx2.rs
--- a/library/stdarch/coresimd/x86/bmi1.rs
+++ b/library/stdarch/coresimd/x86/bmi1.rs
@ -14,9 +14,12 @@ use stdsimd_test::assert_instr;

 /// Extracts bits in range [`start`, `start` + `length`) from `a` into
 /// the least significant bits of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr_u32)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(bextr))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
    _bextr2_u32(
        a,
@ -29,33 +32,45 @@ pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
 ///
 /// Bits [7,0] of `control` specify the index to the first bit in the range to
 /// be extracted, and bits [15,8] specify the length of the range.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u32)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(bextr))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 {
    x86_bmi_bextr_32(a, control)
 }

 /// Bitwise logical `AND` of inverted `a` with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_andn_u32)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(andn))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 {
    !a & b
 }

 /// Extract lowest set isolated bit.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsi_u32)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(blsi))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blsi_u32(x: u32) -> u32 {
    x & x.wrapping_neg()
 }

 /// Get mask up to lowest set bit.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsmsk_u32)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(blsmsk))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blsmsk_u32(x: u32) -> u32 {
    x ^ (x.wrapping_sub(1_u32))
 }
@ -63,9 +78,12 @@ pub unsafe fn _blsmsk_u32(x: u32) -> u32 {
 /// Resets the lowest set bit of `x`.
 ///
 /// If `x` is sets CF.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsr_u32)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(blsr))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blsr_u32(x: u32) -> u32 {
    x & (x.wrapping_sub(1))
 }
@ -73,9 +91,12 @@ pub unsafe fn _blsr_u32(x: u32) -> u32 {
 /// Counts the number of trailing least significant zero bits.
 ///
 /// When the source operand is 0, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_tzcnt_u32)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _tzcnt_u32(x: u32) -> u32 {
    x.trailing_zeros()
 }
@ -83,9 +104,12 @@ pub unsafe fn _tzcnt_u32(x: u32) -> u32 {
 /// Counts the number of trailing least significant zero bits.
 ///
 /// When the source operand is 0, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_tzcnt_32)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_tzcnt_32(x: u32) -> i32 {
    x.trailing_zeros() as i32
 }
--- a/library/stdarch/coresimd/x86/bmi2.rs
+++ b/library/stdarch/coresimd/x86/bmi2.rs
@ -17,11 +17,14 @@ use stdsimd_test::assert_instr;
 ///
 /// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with
 /// the low half and the high half of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mulx_u32)
 #[inline]
 // LLVM BUG (should be mulxl): https://bugs.llvm.org/show_bug.cgi?id=34232
 #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))]
 #[cfg_attr(all(test, target_arch = "x86"), assert_instr(mulx))]
 #[target_feature(enable = "bmi2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mulx_u32(a: u32, b: u32, hi: &mut u32) -> u32 {
    let result: u64 = (a as u64) * (b as u64);
    *hi = (result >> 32) as u32;
@ -29,27 +32,36 @@ pub unsafe fn _mulx_u32(a: u32, b: u32, hi: &mut u32) -> u32 {
 }

 /// Zero higher bits of `a` >= `index`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bzhi_u32)
 #[inline]
 #[target_feature(enable = "bmi2")]
 #[cfg_attr(test, assert_instr(bzhi))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 {
    x86_bmi2_bzhi_32(a, index)
 }

 /// Scatter contiguous low order bits of `a` to the result at the positions
 /// specified by the `mask`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pdep_u32)
 #[inline]
 #[target_feature(enable = "bmi2")]
 #[cfg_attr(test, assert_instr(pdep))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 {
    x86_bmi2_pdep_32(a, mask)
 }

 /// Gathers the bits of `x` specified by the `mask` into the contiguous low
 /// order bit positions of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pext_u32)
 #[inline]
 #[target_feature(enable = "bmi2")]
 #[cfg_attr(test, assert_instr(pext))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _pext_u32(a: u32, mask: u32) -> u32 {
    x86_bmi2_pext_32(a, mask)
 }
--- a/library/stdarch/coresimd/x86/bswap.rs
+++ b/library/stdarch/coresimd/x86/bswap.rs
@ -6,8 +6,11 @@
 use stdsimd_test::assert_instr;

 /// Return an integer with the reversed byte order of x
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bswap)
 #[inline]
 #[cfg_attr(test, assert_instr(bswap))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _bswap(x: i32) -> i32 {
    bswap_i32(x)
 }
--- a/library/stdarch/coresimd/x86/cpuid.rs
+++ b/library/stdarch/coresimd/x86/cpuid.rs
@ -10,14 +10,19 @@ use stdsimd_test::assert_instr;
 /// Result of the `cpuid` instruction.
 #[derive(Copy, Clone, Eq, Ord, PartialEq, PartialOrd)]
 #[cfg_attr(feature = "cargo-clippy", allow(stutter))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub struct CpuidResult {
    /// EAX register.
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub eax: u32,
    /// EBX register.
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub ebx: u32,
    /// ECX register.
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub ecx: u32,
    /// EDX register.
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub edx: u32,
 }

@ -46,6 +51,7 @@ pub struct CpuidResult {
 /// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
 #[inline]
 #[cfg_attr(test, assert_instr(cpuid))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult {
    let mut r = mem::uninitialized::<CpuidResult>();
    if cfg!(target_arch = "x86") {
@ -66,6 +72,7 @@ pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult {
 /// See [`__cpuid_count`](fn.__cpuid_count.html).
 #[inline]
 #[cfg_attr(test, assert_instr(cpuid))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn __cpuid(leaf: u32) -> CpuidResult {
    __cpuid_count(leaf, 0)
 }
@ -114,6 +121,7 @@ pub fn has_cpuid() -> bool {
 /// See also [`__cpuid`](fn.__cpuid.html) and
 /// [`__cpuid_count`](fn.__cpuid_count.html).
 #[inline]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn __get_cpuid_max(leaf: u32) -> (u32, u32) {
    let CpuidResult { eax, ebx, .. } = __cpuid(leaf);
    (eax, ebx)
--- a/library/stdarch/coresimd/x86/eflags.rs
+++ b/library/stdarch/coresimd/x86/eflags.rs
@ -1,8 +1,11 @@
 //! `i386` intrinsics

 /// Reads EFLAGS.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__readeflags)
 #[cfg(target_arch = "x86")]
 #[inline(always)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn __readeflags() -> u32 {
    let eflags: u32;
    asm!("pushfd; popl $0" : "=r"(eflags) : : : "volatile");
@ -10,8 +13,11 @@ pub unsafe fn __readeflags() -> u32 {
 }

 /// Reads EFLAGS.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__readeflags)
 #[cfg(target_arch = "x86_64")]
 #[inline(always)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn __readeflags() -> u64 {
    let eflags: u64;
    asm!("pushfq; popq $0" : "=r"(eflags) : : : "volatile");
@ -19,15 +25,21 @@ pub unsafe fn __readeflags() -> u64 {
 }

 /// Write EFLAGS.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__writeeflags)
 #[cfg(target_arch = "x86")]
 #[inline(always)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn __writeeflags(eflags: u32) {
    asm!("pushl $0; popfd" : : "r"(eflags) : "cc", "flags" : "volatile");
 }

 /// Write EFLAGS.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__writeeflags)
 #[cfg(target_arch = "x86_64")]
 #[inline(always)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn __writeeflags(eflags: u64) {
    asm!("pushq $0; popfq" : : "r"(eflags) : "cc", "flags" : "volatile");
 }
--- a/library/stdarch/coresimd/x86/fxsr.rs
+++ b/library/stdarch/coresimd/x86/fxsr.rs
@ -21,9 +21,12 @@ extern "C" {
 ///
 /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html
 /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxsave)
 #[inline]
 #[target_feature(enable = "fxsr")]
 #[cfg_attr(test, assert_instr(fxsave))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _fxsave(mem_addr: *mut u8) {
    fxsave(mem_addr)
 }
@ -42,9 +45,12 @@ pub unsafe fn _fxsave(mem_addr: *mut u8) {
 ///
 /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html
 /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxrstor)
 #[inline]
 #[target_feature(enable = "fxsr")]
 #[cfg_attr(test, assert_instr(fxrstor))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _fxrstor(mem_addr: *const u8) {
    fxrstor(mem_addr)
 }
--- a/library/stdarch/coresimd/x86/mod.rs
+++ b/library/stdarch/coresimd/x86/mod.rs
@ -33,7 +33,8 @@ types! {
    /// # Examples
    ///
    /// ```
-    /// # #![feature(cfg_target_feature, target_feature, stdsimd)]
+    /// # #![feature(stdsimd)]
+    /// # #![cfg_attr(not(dox), feature(cfg_target_feature, target_feature))]
    /// # #![cfg_attr(not(dox), no_std)]
    /// # #[cfg(not(dox))]
    /// # extern crate std as real_std;
@ -83,7 +84,7 @@ types! {
    /// # Examples
    ///
    /// ```
-    /// # #![feature(cfg_target_feature, target_feature, stdsimd)]
+    /// # #![cfg_attr(not(dox), feature(cfg_target_feature, target_feature, stdsimd))]
    /// # #![cfg_attr(not(dox), no_std)]
    /// # #[cfg(not(dox))]
    /// # extern crate std as real_std;
@ -105,6 +106,7 @@ types! {
    /// # if is_x86_feature_detected!("sse2") { unsafe { foo() } }
    /// # }
    /// ```
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub struct __m128i(i64, i64);

    /// 128-bit wide set of four `f32` types, x86-specific
@ -126,7 +128,7 @@ types! {
    /// # Examples
    ///
    /// ```
-    /// # #![feature(cfg_target_feature, target_feature, stdsimd)]
+    /// # #![cfg_attr(not(dox), feature(cfg_target_feature, target_feature, stdsimd))]
    /// # #![cfg_attr(not(dox), no_std)]
    /// # #[cfg(not(dox))]
    /// # extern crate std as real_std;
@ -148,6 +150,7 @@ types! {
    /// # if is_x86_feature_detected!("sse") { unsafe { foo() } }
    /// # }
    /// ```
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub struct __m128(f32, f32, f32, f32);

    /// 128-bit wide set of two `f64` types, x86-specific
@ -169,7 +172,7 @@ types! {
    /// # Examples
    ///
    /// ```
-    /// # #![feature(cfg_target_feature, target_feature, stdsimd)]
+    /// # #![cfg_attr(not(dox), feature(cfg_target_feature, target_feature, stdsimd))]
    /// # #![cfg_attr(not(dox), no_std)]
    /// # #[cfg(not(dox))]
    /// # extern crate std as real_std;
@ -191,6 +194,7 @@ types! {
    /// # if is_x86_feature_detected!("sse") { unsafe { foo() } }
    /// # }
    /// ```
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub struct __m128d(f64, f64);

    /// 256-bit wide integer vector type, x86-specific
@ -216,7 +220,7 @@ types! {
    /// # Examples
    ///
    /// ```
-    /// # #![feature(cfg_target_feature, target_feature, stdsimd)]
+    /// # #![cfg_attr(not(dox), feature(cfg_target_feature, target_feature, stdsimd))]
    /// # #![cfg_attr(not(dox), no_std)]
    /// # #[cfg(not(dox))]
    /// # extern crate std as real_std;
@ -238,6 +242,7 @@ types! {
    /// # if is_x86_feature_detected!("avx") { unsafe { foo() } }
    /// # }
    /// ```
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub struct __m256i(i64, i64, i64, i64);

    /// 256-bit wide set of eight `f32` types, x86-specific
@ -259,7 +264,7 @@ types! {
    /// # Examples
    ///
    /// ```
-    /// # #![feature(cfg_target_feature, target_feature, stdsimd)]
+    /// # #![cfg_attr(not(dox), feature(cfg_target_feature, target_feature, stdsimd))]
    /// # #![cfg_attr(not(dox), no_std)]
    /// # #[cfg(not(dox))]
    /// # extern crate std as real_std;
@ -281,6 +286,7 @@ types! {
    /// # if is_x86_feature_detected!("sse") { unsafe { foo() } }
    /// # }
    /// ```
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub struct __m256(f32, f32, f32, f32, f32, f32, f32, f32);

    /// 256-bit wide set of four `f64` types, x86-specific
@ -302,7 +308,7 @@ types! {
    /// # Examples
    ///
    /// ```
-    /// # #![feature(cfg_target_feature, target_feature, stdsimd)]
+    /// # #![cfg_attr(not(dox), feature(cfg_target_feature, target_feature, stdsimd))]
    /// # #![cfg_attr(not(dox), no_std)]
    /// # #[cfg(not(dox))]
    /// # extern crate std as real_std;
@ -324,6 +330,7 @@ types! {
    /// # if is_x86_feature_detected!("avx") { unsafe { foo() } }
    /// # }
    /// ```
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub struct __m256d(f64, f64, f64, f64);
 }

@ -334,6 +341,7 @@ pub use self::test::*;

 #[doc(hidden)]
 #[allow(non_camel_case_types)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub(crate) trait m128iExt: Sized {
    fn as_m128i(self) -> __m128i;

@ -387,6 +395,7 @@ impl m128iExt for __m128i {

 #[doc(hidden)]
 #[allow(non_camel_case_types)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub(crate) trait m256iExt: Sized {
    fn as_m256i(self) -> __m256i;

@ -590,8 +599,8 @@ pub use self::avx2::*;

 mod abm;
 pub use self::abm::*;
-mod bmi;
-pub use self::bmi::*;
+mod bmi1;
+pub use self::bmi1::*;

 mod bmi2;
 pub use self::bmi2::*;
--- a/library/stdarch/coresimd/x86/pclmulqdq.rs
+++ b/library/stdarch/coresimd/x86/pclmulqdq.rs
@ -21,6 +21,8 @@ extern "C" {
 ///
 /// The immediate byte is used for determining which halves of `a` and `b`
 /// should be used. Immediate bits other than 0 and 4 are ignored.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128)
 #[inline]
 #[target_feature(enable = "pclmulqdq")]
 #[cfg_attr(all(test, not(target_os = "linux")),
@ -34,6 +36,7 @@ extern "C" {
 #[cfg_attr(all(test, target_os = "linux"),
           assert_instr(pclmulhqhqdq, imm8 = 17))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_clmulepi64_si128(
    a: __m128i, b: __m128i, imm8: i32
 ) -> __m128i {
--- a/library/stdarch/coresimd/x86/rdrand.rs
+++ b/library/stdarch/coresimd/x86/rdrand.rs
@ -14,10 +14,13 @@ use stdsimd_test::assert_instr;

 /// Read a hardware generated 16-bit random value and store the result in val.
 /// Return 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdrand16_step)
 #[inline]
 #[target_feature(enable = "rdrand")]
 #[cfg_attr(test, assert_instr(rdrand))]
 #[cfg_attr(feature = "cargo-clippy", allow(stutter))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 {
    let (v, flag) = x86_rdrand16_step();
    *val = v;
@ -26,10 +29,13 @@ pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 {

 /// Read a hardware generated 32-bit random value and store the result in val.
 /// Return 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdrand32_step)
 #[inline]
 #[target_feature(enable = "rdrand")]
 #[cfg_attr(test, assert_instr(rdrand))]
 #[cfg_attr(feature = "cargo-clippy", allow(stutter))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 {
    let (v, flag) = x86_rdrand32_step();
    *val = v;
@ -38,9 +44,12 @@ pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 {

 /// Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store
 /// in val. Return 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdseed16_step)
 #[inline]
 #[target_feature(enable = "rdseed")]
 #[cfg_attr(test, assert_instr(rdseed))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _rdseed16_step(val: &mut u16) -> i32 {
    let (v, flag) = x86_rdseed16_step();
    *val = v;
@ -49,9 +58,12 @@ pub unsafe fn _rdseed16_step(val: &mut u16) -> i32 {

 /// Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store
 /// in val. Return 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdseed32_step)
 #[inline]
 #[target_feature(enable = "rdseed")]
 #[cfg_attr(test, assert_instr(rdseed))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _rdseed32_step(val: &mut u32) -> i32 {
    let (v, flag) = x86_rdseed32_step();
    *val = v;
--- a/library/stdarch/coresimd/x86/rdtsc.rs
+++ b/library/stdarch/coresimd/x86/rdtsc.rs
@ -17,8 +17,11 @@ use stdsimd_test::assert_instr;
 ///
 /// On processors that support the Intel 64 architecture, the
 /// high-order 32 bits of each of RAX and RDX are cleared.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdtsc)
 #[inline]
 #[cfg_attr(test, assert_instr(rdtsc))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _rdtsc() -> i64 {
    rdtsc()
 }
@ -37,8 +40,11 @@ pub unsafe fn _rdtsc() -> i64 {
 ///
 /// On processors that support the Intel 64 architecture, the
 /// high-order 32 bits of each of RAX, RDX, and RCX are cleared.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__rdtscp)
 #[inline]
 #[cfg_attr(test, assert_instr(rdtscp))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn __rdtscp(aux: *mut u32) -> u64 {
    rdtscp(aux as *mut _)
 }
--- a/library/stdarch/coresimd/x86/sha.rs
+++ b/library/stdarch/coresimd/x86/sha.rs
@ -26,9 +26,12 @@ use stdsimd_test::assert_instr;
 /// Perform an intermediate calculation for the next four SHA1 message values
 /// (unsigned 32-bit integers) using previous message values from `a` and `b`,
 /// and returning the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1msg1_epu32)
 #[inline]
 #[target_feature(enable = "sha")]
 #[cfg_attr(test, assert_instr(sha1msg1))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_sha1msg1_epu32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(sha1msg1(a.as_i32x4(), b.as_i32x4()))
 }
@ -36,9 +39,12 @@ pub unsafe fn _mm_sha1msg1_epu32(a: __m128i, b: __m128i) -> __m128i {
 /// Perform the final calculation for the next four SHA1 message values
 /// (unsigned 32-bit integers) using the intermediate result in `a` and the
 /// previous message values in `b`, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1msg2_epu32)
 #[inline]
 #[target_feature(enable = "sha")]
 #[cfg_attr(test, assert_instr(sha1msg2))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_sha1msg2_epu32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(sha1msg2(a.as_i32x4(), b.as_i32x4()))
 }
@ -46,9 +52,12 @@ pub unsafe fn _mm_sha1msg2_epu32(a: __m128i, b: __m128i) -> __m128i {
 /// Calculate SHA1 state variable E after four rounds of operation from the
 /// current SHA1 state variable `a`, add that value to the scheduled values
 /// (unsigned 32-bit integers) in `b`, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1nexte_epu32)
 #[inline]
 #[target_feature(enable = "sha")]
 #[cfg_attr(test, assert_instr(sha1nexte))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_sha1nexte_epu32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(sha1nexte(a.as_i32x4(), b.as_i32x4()))
 }
@ -58,10 +67,13 @@ pub unsafe fn _mm_sha1nexte_epu32(a: __m128i, b: __m128i) -> __m128i {
 /// (unsigned 32-bit integers), and state variable E from `b`, and return the
 /// updated SHA1 state (A,B,C,D). `func` contains the logic functions and round
 /// constants.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1rnds4_epu32)
 #[inline]
 #[target_feature(enable = "sha")]
 #[cfg_attr(test, assert_instr(sha1rnds4, func = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_sha1rnds4_epu32(
    a: __m128i, b: __m128i, func: i32
 ) -> __m128i {
@ -79,9 +91,12 @@ pub unsafe fn _mm_sha1rnds4_epu32(
 /// Perform an intermediate calculation for the next four SHA256 message values
 /// (unsigned 32-bit integers) using previous message values from `a` and `b`,
 /// and return the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha256msg1_epu32)
 #[inline]
 #[target_feature(enable = "sha")]
 #[cfg_attr(test, assert_instr(sha256msg1))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_sha256msg1_epu32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(sha256msg1(a.as_i32x4(), b.as_i32x4()))
 }
@ -89,9 +104,12 @@ pub unsafe fn _mm_sha256msg1_epu32(a: __m128i, b: __m128i) -> __m128i {
 /// Perform the final calculation for the next four SHA256 message values
 /// (unsigned 32-bit integers) using previous message values from `a` and `b`,
 /// and return the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha256msg2_epu32)
 #[inline]
 #[target_feature(enable = "sha")]
 #[cfg_attr(test, assert_instr(sha256msg2))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_sha256msg2_epu32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(sha256msg2(a.as_i32x4(), b.as_i32x4()))
 }
@ -101,9 +119,12 @@ pub unsafe fn _mm_sha256msg2_epu32(a: __m128i, b: __m128i) -> __m128i {
 /// pre-computed sum of the next 2 round message values (unsigned 32-bit
 /// integers) and the corresponding round constants from `k`, and store the
 /// updated SHA256 state (A,B,E,F) in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha256rnds2_epu32)
 #[inline]
 #[target_feature(enable = "sha")]
 #[cfg_attr(test, assert_instr(sha256rnds2))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_sha256rnds2_epu32(
    a: __m128i, b: __m128i, k: __m128i
 ) -> __m128i {
--- a/library/stdarch/coresimd/x86/sse.rs
+++ b/library/stdarch/coresimd/x86/sse.rs
--- a/library/stdarch/coresimd/x86/sse2.rs
+++ b/library/stdarch/coresimd/x86/sse2.rs
--- a/library/stdarch/coresimd/x86/sse3.rs
+++ b/library/stdarch/coresimd/x86/sse3.rs
@ -9,54 +9,72 @@ use stdsimd_test::assert_instr;

 /// Alternatively add and subtract packed single-precision (32-bit)
 /// floating-point elements in `a` to/from packed elements in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_ps)
 #[inline]
 #[target_feature(enable = "sse3")]
 #[cfg_attr(test, assert_instr(addsubps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 {
    addsubps(a, b)
 }

 /// Alternatively add and subtract packed double-precision (64-bit)
 /// floating-point elements in `a` to/from packed elements in `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd)
 #[inline]
 #[target_feature(enable = "sse3")]
 #[cfg_attr(test, assert_instr(addsubpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d {
    addsubpd(a, b)
 }

 /// Horizontally add adjacent pairs of double-precision (64-bit)
 /// floating-point elements in `a` and `b`, and pack the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd)
 #[inline]
 #[target_feature(enable = "sse3")]
 #[cfg_attr(test, assert_instr(haddpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d {
    haddpd(a, b)
 }

 /// Horizontally add adjacent pairs of single-precision (32-bit)
 /// floating-point elements in `a` and `b`, and pack the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_ps)
 #[inline]
 #[target_feature(enable = "sse3")]
 #[cfg_attr(test, assert_instr(haddps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 {
    haddps(a, b)
 }

 /// Horizontally subtract adjacent pairs of double-precision (64-bit)
 /// floating-point elements in `a` and `b`, and pack the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd)
 #[inline]
 #[target_feature(enable = "sse3")]
 #[cfg_attr(test, assert_instr(hsubpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d {
    hsubpd(a, b)
 }

 /// Horizontally add adjacent pairs of single-precision (32-bit)
 /// floating-point elements in `a` and `b`, and pack the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps)
 #[inline]
 #[target_feature(enable = "sse3")]
 #[cfg_attr(test, assert_instr(hsubps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 {
    hsubps(a, b)
 }
@ -64,45 +82,60 @@ pub unsafe fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 {
 /// Load 128-bits of integer data from unaligned memory.
 /// This intrinsic may perform better than `_mm_loadu_si128`
 /// when the data crosses a cache line boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128)
 #[inline]
 #[target_feature(enable = "sse3")]
 #[cfg_attr(test, assert_instr(lddqu))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i {
    mem::transmute(lddqu(mem_addr as *const _))
 }

 /// Duplicate the low double-precision (64-bit) floating-point element
 /// from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd)
 #[inline]
 #[target_feature(enable = "sse3")]
 #[cfg_attr(test, assert_instr(movddup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d {
    simd_shuffle2(a, a, [0, 0])
 }

 /// Load a double-precision (64-bit) floating-point element from memory
 /// into both elements of return vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd)
 #[inline]
 #[target_feature(enable = "sse3")]
 #[cfg_attr(test, assert_instr(movddup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d {
    _mm_load1_pd(mem_addr)
 }

 /// Duplicate odd-indexed single-precision (32-bit) floating-point elements
 /// from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps)
 #[inline]
 #[target_feature(enable = "sse3")]
 #[cfg_attr(test, assert_instr(movshdup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 {
    simd_shuffle4(a, a, [1, 1, 3, 3])
 }

 /// Duplicate even-indexed single-precision (32-bit) floating-point elements
 /// from `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps)
 #[inline]
 #[target_feature(enable = "sse3")]
 #[cfg_attr(test, assert_instr(movsldup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 {
    simd_shuffle4(a, a, [0, 0, 2, 2])
 }
--- a/library/stdarch/coresimd/x86/sse41.rs
+++ b/library/stdarch/coresimd/x86/sse41.rs
@ -10,34 +10,47 @@ use stdsimd_test::assert_instr;

 // SSE4 rounding constans
 /// round to nearest
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
 /// round down
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
 /// round up
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
 /// truncate
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
 /// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
 /// do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
 /// suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _MM_FROUND_NO_EXC: i32 = 0x08;
 /// round to nearest and do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _MM_FROUND_NINT: i32 = 0x00;
 /// round down and do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _MM_FROUND_FLOOR: i32 =
    (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
 /// round up and do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _MM_FROUND_CEIL: i32 =
    (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
 /// truncate and do not suppress exceptions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _MM_FROUND_TRUNC: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
 /// use MXCSR.RC and do not suppress exceptions; see
 /// `vendor::_MM_SET_ROUNDING_MODE`
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _MM_FROUND_RINT: i32 =
    (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
 /// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _MM_FROUND_NEARBYINT: i32 =
    (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION);

@ -46,9 +59,12 @@ pub const _MM_FROUND_NEARBYINT: i32 =
 /// The high bit of each corresponding mask byte determines the selection.
 /// If the high bit is set the element of `a` is selected. The element
 /// of `b` is selected otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_epi8)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pblendvb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blendv_epi8(
    a: __m128i, b: __m128i, mask: __m128i
 ) -> __m128i {
@ -64,10 +80,13 @@ pub unsafe fn _mm_blendv_epi8(
 /// The mask bits determine the selection. A clear bit selects the
 /// corresponding element of `a`, and a set bit the corresponding
 /// element of `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let a = a.as_i16x8();
    let b = b.as_i16x8();
@ -81,28 +100,37 @@ pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {

 /// Blend packed double-precision (64-bit) floating-point elements from `a`
 /// and `b` using `mask`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(blendvpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    blendvpd(a, b, mask)
 }

 /// Blend packed single-precision (32-bit) floating-point elements from `a`
 /// and `b` using `mask`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(blendvps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    blendvps(a, b, mask)
 }

 /// Blend packed double-precision (64-bit) floating-point elements from `a`
 /// and `b` using control mask `imm2`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d {
    macro_rules! call {
        ($imm2:expr) => {
@ -114,10 +142,13 @@ pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d {

 /// Blend packed single-precision (32-bit) floating-point elements from `a`
 /// and `b` using mask `imm4`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
@ -129,11 +160,14 @@ pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 {

 /// Extract a single-precision (32-bit) floating-point element from `a`,
 /// selected with `imm8`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_ps)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 // TODO: Add test for Windows
 #[cfg_attr(test, assert_instr(extractps, imm8 = 0))]
 #[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 {
    mem::transmute(simd_extract::<_, f32>(a, imm8 as u32 & 0b11))
 }
@ -142,21 +176,27 @@ pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 {
 /// integer containing the zero-extended integer data.
 ///
 /// See [LLVM commit D20468][https://reviews.llvm.org/D20468].
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pextrb, imm8 = 0))]
 #[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 {
    let imm8 = (imm8 & 15) as u32;
    simd_extract::<_, u8>(a.as_u8x16(), imm8) as i32
 }

 /// Extract an 32-bit integer from `a` selected with `imm8`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi32)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 // TODO: Add test for Windows
 #[cfg_attr(test, assert_instr(extractps, imm8 = 1))]
 #[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 {
    let imm8 = (imm8 & 3) as u32;
    simd_extract::<_, i32>(a.as_i32x4(), imm8)
@ -184,10 +224,13 @@ pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 {
 ///
 /// * Bits `[3:0]`: If any of these bits are set, the corresponding result
 /// element is cleared.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_ps)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
    macro_rules! call {
        ($imm8:expr) => {
@ -199,10 +242,13 @@ pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {

 /// Return a copy of `a` with the 8-bit integer from `i` inserted at a
 /// location specified by `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi8)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i {
    mem::transmute(simd_insert(
        a.as_i8x16(),
@ -213,10 +259,13 @@ pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i {

 /// Return a copy of `a` with the 32-bit integer from `i` inserted at a
 /// location specified by `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi32)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i {
    mem::transmute(simd_insert(
        a.as_i32x4(),
@ -227,97 +276,130 @@ pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i {

 /// Compare packed 8-bit integers in `a` and `b` and return packed maximum
 /// values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmaxsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
 }

 /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed
 /// maximum.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmaxuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
 }

 /// Compare packed 32-bit integers in `a` and `b`, and return packed maximum
 /// values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi32)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmaxsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
 }

 /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed
 /// maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmaxud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
 }

 /// Compare packed 8-bit integers in `a` and `b` and return packed minimum
 /// values in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pminsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
 }

 /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed
 /// minimum.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pminuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
 }

 /// Compare packed 32-bit integers in `a` and `b`, and return packed minimum
 /// values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi32)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pminsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
 }

 /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed
 /// minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pminud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pminud(a.as_u32x4(), b.as_u32x4()))
 }

 /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
 /// using unsigned saturation
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi32)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(packusdw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
 }

 /// Compare packed 64-bit integers in `a` and `b` for equality
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pcmpeqq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
 }

 /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi16)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmovsxbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle8::<_, i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
@ -325,9 +407,12 @@ pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
 }

 /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi32)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmovsxbd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle4::<_, i8x4>(a, a, [0, 1, 2, 3]);
@ -336,9 +421,12 @@ pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {

 /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
 /// 64-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi64)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmovsxbq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle2::<_, i8x2>(a, a, [0, 1]);
@ -346,9 +434,12 @@ pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
 }

 /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi32)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmovsxwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a = simd_shuffle4::<_, i16x4>(a, a, [0, 1, 2, 3]);
@ -356,9 +447,12 @@ pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
 }

 /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi64)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmovsxwq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a = simd_shuffle2::<_, i16x2>(a, a, [0, 1]);
@ -366,9 +460,12 @@ pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
 }

 /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi64)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmovsxdq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let a = simd_shuffle2::<_, i32x2>(a, a, [0, 1]);
@ -376,9 +473,12 @@ pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
 }

 /// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmovzxbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle8::<_, u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
@ -386,9 +486,12 @@ pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
 }

 /// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi32)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmovzxbd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle4::<_, u8x4>(a, a, [0, 1, 2, 3]);
@ -396,9 +499,12 @@ pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
 }

 /// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi64)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmovzxbq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle2::<_, u8x2>(a, a, [0, 1]);
@ -407,9 +513,12 @@ pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {

 /// Zero extend packed unsigned 16-bit integers in `a`
 /// to packed 32-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi32)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmovzxwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a = simd_shuffle4::<_, u16x4>(a, a, [0, 1, 2, 3]);
@ -418,9 +527,12 @@ pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {

 /// Zero extend packed unsigned 16-bit integers in `a`
 /// to packed 64-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi64)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmovzxwq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a = simd_shuffle2::<_, u16x2>(a, a, [0, 1]);
@ -429,9 +541,12 @@ pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {

 /// Zero extend packed unsigned 32-bit integers in `a`
 /// to packed 64-bit integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_epi64)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmovzxdq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    let a = a.as_u32x4();
    let a = simd_shuffle2::<_, u32x2>(a, a, [0, 1]);
@ -445,10 +560,13 @@ pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
 /// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
 /// the dot product will be stored in the return value component. Otherwise if
 /// the broadcast mask bit is zero then the return component will be zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(dppd, imm8 = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
    macro_rules! call {
        ($imm8:expr) => {
@ -465,10 +583,13 @@ pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
 /// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
 /// the dot product will be stored in the return value component. Otherwise if
 /// the broadcast mask bit is zero then the return component will be zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(dpps, imm8 = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
    macro_rules! call {
        ($imm8:expr) => {
@ -481,9 +602,12 @@ pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
 /// Round the packed double-precision (64-bit) floating-point elements in `a`
 /// down to an integer value, and store the results as packed double-precision
 /// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(roundpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
    roundpd(a, _MM_FROUND_FLOOR)
 }
@ -491,9 +615,12 @@ pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
 /// Round the packed single-precision (32-bit) floating-point elements in `a`
 /// down to an integer value, and store the results as packed single-precision
 /// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(roundps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    roundps(a, _MM_FROUND_FLOOR)
 }
@ -503,9 +630,12 @@ pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
 /// floating-point element in the lower element of the intrinsic result,
 /// and copy the upper element from `a` to the upper element of the intrinsic
 /// result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(roundsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_FLOOR)
 }
@ -515,9 +645,12 @@ pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
 /// floating-point element in the lower element of the intrinsic result,
 /// and copy the upper 3 packed elements from `a` to the upper elements
 /// of the intrinsic result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(roundss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_FLOOR)
 }
@ -525,9 +658,12 @@ pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
 /// Round the packed double-precision (64-bit) floating-point elements in `a`
 /// up to an integer value, and store the results as packed double-precision
 /// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(roundpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    roundpd(a, _MM_FROUND_CEIL)
 }
@ -535,9 +671,12 @@ pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
 /// Round the packed single-precision (32-bit) floating-point elements in `a`
 /// up to an integer value, and store the results as packed single-precision
 /// floating-point elements.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(roundps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    roundps(a, _MM_FROUND_CEIL)
 }
@ -547,9 +686,12 @@ pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
 /// floating-point element in the lower element of the intrisic result,
 /// and copy the upper element from `a` to the upper element
 /// of the intrinsic result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(roundsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_CEIL)
 }
@ -559,9 +701,12 @@ pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
 /// floating-point element in the lower element of the intrinsic result,
 /// and copy the upper 3 packed elements from `a` to the upper elements
 /// of the intrinsic result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(roundss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_CEIL)
 }
@ -602,10 +747,13 @@ pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
 /// _MM_FROUND_CUR_DIRECTION;
 /// # }
 /// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(roundpd, rounding = 0))]
 #[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d {
    macro_rules! call {
        ($imm4:expr) => {
@ -652,10 +800,13 @@ pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d {
 /// _MM_FROUND_CUR_DIRECTION;
 /// # }
 /// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(roundps, rounding = 0))]
 #[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
@ -703,10 +854,13 @@ pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 {
 /// _MM_FROUND_CUR_DIRECTION;
 /// # }
 /// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(roundsd, rounding = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
    macro_rules! call {
        ($imm4:expr) => {
@ -754,10 +908,13 @@ pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
 /// _MM_FROUND_CUR_DIRECTION;
 /// # }
 /// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(roundss, rounding = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
@ -786,18 +943,24 @@ pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
 /// * bits `[15:0]` - contain the minimum value found in parameter `a`,
 /// * bits `[18:16]` - contain the index of the minimum value
 /// * remaining bits are set to `0`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(phminposuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    mem::transmute(phminposuw(a.as_u16x8()))
 }

 /// Multiply the low 32-bit integers from each packed 64-bit
 /// element in `a` and `b`, and return the signed 64-bit result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epi32)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmuldq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
 }
@ -808,9 +971,12 @@ pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
 /// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping
 /// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would
 /// return a negative number.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi32)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pmulld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
 }
@ -846,10 +1012,13 @@ pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
 ///
 /// * A `__m128i` vector containing the sums of the sets of
 ///   absolute differences between both operands.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let a = a.as_u8x16();
    let b = b.as_u8x16();
@ -874,9 +1043,12 @@ pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
 ///
 /// * `1` - if the specified bits are all zeros,
 /// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestz(a.as_i64x2(), mask.as_i64x2())
 }
@ -894,9 +1066,12 @@ pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
 ///
 /// * `1` - if the specified bits are all ones,
 /// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestc(a.as_i64x2(), mask.as_i64x2())
 }
@ -914,9 +1089,12 @@ pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
 ///
 /// * `1` - if the specified bits are neither all zeros nor all ones,
 /// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
 }
@ -934,9 +1112,12 @@ pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
 ///
 /// * `1` - if the specified bits are all zeros,
 /// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testz_si128(a, mask)
 }
@ -952,10 +1133,13 @@ pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
 ///
 /// * `1` - if the bits specified in the operand are all set to 1,
 /// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pcmpeqd))]
 #[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
 }
@ -973,9 +1157,12 @@ pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
 ///
 /// * `1` - if the specified bits are neither all zeros nor all ones,
 /// * `0` - otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(ptest))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testnzc_si128(a, mask)
 }
--- a/library/stdarch/coresimd/x86/sse42.rs
+++ b/library/stdarch/coresimd/x86/sse42.rs
@ -10,49 +10,68 @@ use coresimd::simd_llvm::*;
 use coresimd::x86::*;

 /// String contains unsigned 8-bit characters *(Default)*
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_UBYTE_OPS: i32 = 0b0000_0000;
 /// String contains unsigned 16-bit characters
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_UWORD_OPS: i32 = 0b0000_0001;
 /// String contains signed 8-bit characters
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_SBYTE_OPS: i32 = 0b0000_0010;
 /// String contains unsigned 16-bit characters
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_SWORD_OPS: i32 = 0b0000_0011;

 /// For each character in `a`, find if it is in `b` *(Default)*
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_CMP_EQUAL_ANY: i32 = 0b0000_0000;
 /// For each character in `a`, determine if
 /// `b[0] <= c <= b[1] or b[1] <= c <= b[2]...`
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_CMP_RANGES: i32 = 0b0000_0100;
 /// The strings defined by `a` and `b` are equal
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_CMP_EQUAL_EACH: i32 = 0b0000_1000;
 /// Search for the defined substring in the target
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_CMP_EQUAL_ORDERED: i32 = 0b0000_1100;

 /// Do not negate results *(Default)*
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_POSITIVE_POLARITY: i32 = 0b0000_0000;
 /// Negate results
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_NEGATIVE_POLARITY: i32 = 0b0001_0000;
 /// Do not negate results before the end of the string
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_MASKED_POSITIVE_POLARITY: i32 = 0b0010_0000;
 /// Negate results only before the end of the string
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_MASKED_NEGATIVE_POLARITY: i32 = 0b0011_0000;

 /// **Index only**: return the least significant bit *(Default)*
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_LEAST_SIGNIFICANT: i32 = 0b0000_0000;
 /// **Index only**: return the most significant bit
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_MOST_SIGNIFICANT: i32 = 0b0100_0000;

 /// **Mask only**: return the bit mask
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_BIT_MASK: i32 = 0b0000_0000;
 /// **Mask only**: return the byte mask
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _SIDD_UNIT_MASK: i32 = 0b0100_0000;

 /// Compare packed strings with implicit lengths in `a` and `b` using the
 /// control in `imm8`, and return the generated mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrm)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpistrm, imm8 = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@ -296,10 +315,13 @@ pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
 /// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html
 /// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html
 /// [`_mm_cmpestri`]: fn._mm_cmpestri.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistri)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i32) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@ -314,10 +336,13 @@ pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i32) -> i32 {
 /// Compare packed strings with implicit lengths in `a` and `b` using the
 /// control in `imm8`, and return `1` if any character in `b` was null.
 /// and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrz)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i32) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@ -332,10 +357,13 @@ pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i32) -> i32 {
 /// Compare packed strings with implicit lengths in `a` and `b` using the
 /// control in `imm8`, and return `1` if the resulting mask was non-zero,
 /// and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrc)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i32) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@ -350,10 +378,13 @@ pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i32) -> i32 {
 /// Compare packed strings with implicit lengths in `a` and `b` using the
 /// control in `imm8`, and returns `1` if any character in `a` was null,
 /// and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrs)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i32) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@ -367,10 +398,13 @@ pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i32) -> i32 {

 /// Compare packed strings with implicit lengths in `a` and `b` using the
 /// control in `imm8`, and return bit `0` of the resulting bit mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistro)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i32) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@ -385,10 +419,13 @@ pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i32) -> i32 {
 /// Compare packed strings with implicit lengths in `a` and `b` using the
 /// control in `imm8`, and return `1` if `b` did not contain a null
 /// character and the resulting mask was zero, and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistra)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
@ -402,10 +439,13 @@ pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 {

 /// Compare packed strings in `a` and `b` with lengths `la` and `lb`
 /// using the control in `imm8`, and return the generated mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrm)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpestrm, imm8 = 0))]
 #[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpestrm(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
 ) -> __m128i {
@ -506,10 +546,13 @@ pub unsafe fn _mm_cmpestrm(
 /// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html
 /// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html
 /// [`_mm_cmpistri`]: fn._mm_cmpistri.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestri)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
 #[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpestri(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
 ) -> i32 {
@ -526,10 +569,13 @@ pub unsafe fn _mm_cmpestri(
 /// Compare packed strings in `a` and `b` with lengths `la` and `lb`
 /// using the control in `imm8`, and return `1` if any character in
 /// `b` was null, and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrz)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
 #[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpestrz(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
 ) -> i32 {
@ -546,10 +592,13 @@ pub unsafe fn _mm_cmpestrz(
 /// Compare packed strings in `a` and `b` with lengths `la` and `lb`
 /// using the control in `imm8`, and return `1` if the resulting mask
 /// was non-zero, and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrc)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
 #[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpestrc(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
 ) -> i32 {
@ -566,10 +615,13 @@ pub unsafe fn _mm_cmpestrc(
 /// Compare packed strings in `a` and `b` with lengths `la` and `lb`
 /// using the control in `imm8`, and return `1` if any character in
 /// a was null, and `0` otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrs)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
 #[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpestrs(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
 ) -> i32 {
@ -586,10 +638,13 @@ pub unsafe fn _mm_cmpestrs(
 /// Compare packed strings in `a` and `b` with lengths `la` and `lb`
 /// using the control in `imm8`, and return bit `0` of the resulting
 /// bit mask.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestro)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
 #[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpestro(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
 ) -> i32 {
@ -607,10 +662,13 @@ pub unsafe fn _mm_cmpestro(
 /// using the control in `imm8`, and return `1` if `b` did not
 /// contain a null character and the resulting mask was zero, and `0`
 /// otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestra)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
 #[rustc_args_required_const(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpestra(
    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
 ) -> i32 {
@ -626,36 +684,48 @@ pub unsafe fn _mm_cmpestra(

 /// Starting with the initial value in `crc`, return the accumulated
 /// CRC32 value for unsigned 8-bit integer `v`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u8)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(crc32))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_crc32_u8(crc: u32, v: u8) -> u32 {
    crc32_32_8(crc, v)
 }

 /// Starting with the initial value in `crc`, return the accumulated
 /// CRC32 value for unsigned 16-bit integer `v`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u16)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(crc32))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_crc32_u16(crc: u32, v: u16) -> u32 {
    crc32_32_16(crc, v)
 }

 /// Starting with the initial value in `crc`, return the accumulated
 /// CRC32 value for unsigned 32-bit integer `v`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u32)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(crc32))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_crc32_u32(crc: u32, v: u32) -> u32 {
    crc32_32_32(crc, v)
 }

 /// Compare packed 64-bit integers in `a` and `b` for greater-than,
 /// return the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi64)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(pcmpgtq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cmpgt_epi64(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(simd_gt::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
 }
--- a/library/stdarch/coresimd/x86/sse4a.rs
+++ b/library/stdarch/coresimd/x86/sse4a.rs
@ -36,6 +36,7 @@ extern "C" {
 #[inline]
 #[target_feature(enable = "sse4a")]
 #[cfg_attr(test, assert_instr(extrq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i {
    mem::transmute(extrq(x.as_i64x2(), y.as_i8x16()))
 }
@ -52,6 +53,7 @@ pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i {
 #[inline]
 #[target_feature(enable = "sse4a")]
 #[cfg_attr(test, assert_instr(insertq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i {
    mem::transmute(insertq(x.as_i64x2(), y.as_i64x2()))
 }
@ -60,6 +62,7 @@ pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i {
 #[inline]
 #[target_feature(enable = "sse4a")]
 #[cfg_attr(test, assert_instr(movntsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
    movntsd(p, a);
 }
@ -68,6 +71,7 @@ pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
 #[inline]
 #[target_feature(enable = "sse4a")]
 #[cfg_attr(test, assert_instr(movntss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) {
    movntss(p, a);
 }
--- a/library/stdarch/coresimd/x86/ssse3.rs
+++ b/library/stdarch/coresimd/x86/ssse3.rs
@ -10,9 +10,12 @@ use stdsimd_test::assert_instr;

 /// Compute the absolute value of packed 8-bit signed integers in `a` and
 /// return the unsigned results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(pabsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i {
    mem::transmute(pabsb128(a.as_i8x16()))
 }
@ -20,9 +23,12 @@ pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i {
 /// Compute the absolute value of each of the packed 16-bit signed integers in
 /// `a` and
 /// return the 16-bit unsigned integer
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(pabsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i {
    mem::transmute(pabsw128(a.as_i16x8()))
 }
@ -30,9 +36,12 @@ pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i {
 /// Compute the absolute value of each of the packed 32-bit signed integers in
 /// `a` and
 /// return the 32-bit unsigned integer
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(pabsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i {
    mem::transmute(pabsd128(a.as_i32x4()))
 }
@ -61,19 +70,25 @@ pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i {
 ///     r
 /// }
 /// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(pshufb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pshufb128(a.as_u8x16(), b.as_u8x16()))
 }

 /// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result,
 /// shift the result right by `n` bytes, and return the low 16 bytes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(palignr, n = 15))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_alignr_epi8(a: __m128i, b: __m128i, n: i32) -> __m128i {
    let n = n as u32;
    // If palignr is shifting the pair of vectors more than the size of two
@ -141,9 +156,12 @@ pub unsafe fn _mm_alignr_epi8(a: __m128i, b: __m128i, n: i32) -> __m128i {

 /// Horizontally add the adjacent pairs of values contained in 2 packed
 /// 128-bit vectors of [8 x i16].
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_epi16)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(phaddw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(phaddw128(a.as_i16x8(), b.as_i16x8()))
 }
@ -151,27 +169,36 @@ pub unsafe fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i {
 /// Horizontally add the adjacent pairs of values contained in 2 packed
 /// 128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
 /// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_epi16)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(phaddsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(phaddsw128(a.as_i16x8(), b.as_i16x8()))
 }

 /// Horizontally add the adjacent pairs of values contained in 2 packed
 /// 128-bit vectors of [4 x i32].
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_epi32)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(phaddd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(phaddd128(a.as_i32x4(), b.as_i32x4()))
 }

 /// Horizontally subtract the adjacent pairs of values contained in 2
 /// packed 128-bit vectors of [8 x i16].
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(phsubw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(phsubw128(a.as_i16x8(), b.as_i16x8()))
 }
@ -180,18 +207,24 @@ pub unsafe fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i {
 /// packed 128-bit vectors of [8 x i16]. Positive differences greater than
 /// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
 /// saturated to 8000h.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(phsubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(phsubsw128(a.as_i16x8(), b.as_i16x8()))
 }

 /// Horizontally subtract the adjacent pairs of values contained in 2
 /// packed 128-bit vectors of [4 x i32].
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(phsubd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(phsubd128(a.as_i32x4(), b.as_i32x4()))
 }
@ -201,9 +234,12 @@ pub unsafe fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i {
 /// integer values contained in the second source operand, add pairs of
 /// contiguous products with signed saturation, and writes the 16-bit sums to
 /// the corresponding bits in the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_epi16)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(pmaddubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmaddubsw128(a.as_u8x16(), b.as_i8x16()))
 }
@ -211,9 +247,12 @@ pub unsafe fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i {
 /// Multiply packed 16-bit signed integer values, truncate the 32-bit
 /// product to the 18 most significant bits by right-shifting, round the
 /// truncated value by adding 1, and write bits [16:1] to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_epi16)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(pmulhrsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmulhrsw128(a.as_i16x8(), b.as_i16x8()))
 }
@ -222,9 +261,12 @@ pub unsafe fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i {
 /// integer in `b` is negative, and return the result.
 /// Elements in result are zeroed out when the corresponding element in `b`
 /// is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi8)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(psignb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(psignb128(a.as_i8x16(), b.as_i8x16()))
 }
@ -233,9 +275,12 @@ pub unsafe fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i {
 /// integer in `b` is negative, and return the results.
 /// Elements in result are zeroed out when the corresponding element in `b`
 /// is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi16)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(psignw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(psignw128(a.as_i16x8(), b.as_i16x8()))
 }
@ -244,9 +289,12 @@ pub unsafe fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i {
 /// integer in `b` is negative, and return the results.
 /// Element in result are zeroed out when the corresponding element in `b`
 /// is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi32)
 #[inline]
 #[target_feature(enable = "ssse3")]
 #[cfg_attr(test, assert_instr(psignd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(psignd128(a.as_i32x4(), b.as_i32x4()))
 }
--- a/library/stdarch/coresimd/x86/tbm.rs
+++ b/library/stdarch/coresimd/x86/tbm.rs
@ -70,6 +70,7 @@ pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
 #[inline]
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(blcfill))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blcfill_u32(x: u32) -> u32 {
    x & (x.wrapping_add(1))
 }
@ -81,6 +82,7 @@ pub unsafe fn _blcfill_u32(x: u32) -> u32 {
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(blcfill))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blcfill_u64(x: u64) -> u64 {
    x & (x.wrapping_add(1))
 }
@ -91,6 +93,7 @@ pub unsafe fn _blcfill_u64(x: u64) -> u64 {
 #[inline]
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(blci))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blci_u32(x: u32) -> u32 {
    x | !(x.wrapping_add(1))
 }
@ -102,6 +105,7 @@ pub unsafe fn _blci_u32(x: u32) -> u32 {
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(blci))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blci_u64(x: u64) -> u64 {
    x | !(x.wrapping_add(1))
 }
@ -112,6 +116,7 @@ pub unsafe fn _blci_u64(x: u64) -> u64 {
 #[inline]
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(blcic))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blcic_u32(x: u32) -> u32 {
    !x & (x.wrapping_add(1))
 }
@ -123,6 +128,7 @@ pub unsafe fn _blcic_u32(x: u32) -> u32 {
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(blcic))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blcic_u64(x: u64) -> u64 {
    !x & (x.wrapping_add(1))
 }
@ -134,6 +140,7 @@ pub unsafe fn _blcic_u64(x: u64) -> u64 {
 #[inline]
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(blcmsk))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
    x ^ (x.wrapping_add(1))
 }
@ -146,6 +153,7 @@ pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(blcmsk))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blcmsk_u64(x: u64) -> u64 {
    x ^ (x.wrapping_add(1))
 }
@ -156,6 +164,7 @@ pub unsafe fn _blcmsk_u64(x: u64) -> u64 {
 #[inline]
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(blcs))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blcs_u32(x: u32) -> u32 {
    x | (x.wrapping_add(1))
 }
@ -167,6 +176,7 @@ pub unsafe fn _blcs_u32(x: u32) -> u32 {
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(blcs))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blcs_u64(x: u64) -> u64 {
    x | x.wrapping_add(1)
 }
@ -177,6 +187,7 @@ pub unsafe fn _blcs_u64(x: u64) -> u64 {
 #[inline]
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(blsfill))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blsfill_u32(x: u32) -> u32 {
    x | (x.wrapping_sub(1))
 }
@ -188,6 +199,7 @@ pub unsafe fn _blsfill_u32(x: u32) -> u32 {
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(blsfill))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blsfill_u64(x: u64) -> u64 {
    x | (x.wrapping_sub(1))
 }
@ -198,6 +210,7 @@ pub unsafe fn _blsfill_u64(x: u64) -> u64 {
 #[inline]
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(blsic))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blsic_u32(x: u32) -> u32 {
    !x | (x.wrapping_sub(1))
 }
@ -209,6 +222,7 @@ pub unsafe fn _blsic_u32(x: u32) -> u32 {
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(blsic))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blsic_u64(x: u64) -> u64 {
    !x | (x.wrapping_sub(1))
 }
@ -220,6 +234,7 @@ pub unsafe fn _blsic_u64(x: u64) -> u64 {
 #[inline]
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(t1mskc))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _t1mskc_u32(x: u32) -> u32 {
    !x | (x.wrapping_add(1))
 }
@ -232,6 +247,7 @@ pub unsafe fn _t1mskc_u32(x: u32) -> u32 {
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(t1mskc))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _t1mskc_u64(x: u64) -> u64 {
    !x | (x.wrapping_add(1))
 }
@ -243,6 +259,7 @@ pub unsafe fn _t1mskc_u64(x: u64) -> u64 {
 #[inline]
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(tzmsk))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _tzmsk_u32(x: u32) -> u32 {
    !x & (x.wrapping_sub(1))
 }
@ -255,6 +272,7 @@ pub unsafe fn _tzmsk_u32(x: u32) -> u32 {
 #[target_feature(enable = "tbm")]
 #[cfg_attr(test, assert_instr(tzmsk))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _tzmsk_u64(x: u64) -> u64 {
    !x & (x.wrapping_sub(1))
 }
--- a/library/stdarch/coresimd/x86/xsave.rs
+++ b/library/stdarch/coresimd/x86/xsave.rs
@ -31,9 +31,12 @@ extern "C" {
 ///
 /// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of
 /// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsave)
 #[inline]
 #[target_feature(enable = "xsave")]
 #[cfg_attr(test, assert_instr(xsave))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) {
    xsave(
        mem_addr,
@ -48,9 +51,12 @@ pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) {
 /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and
 /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
 /// boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstor)
 #[inline]
 #[target_feature(enable = "xsave")]
 #[cfg_attr(test, assert_instr(xrstor))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) {
    xrstor(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
 }
@ -58,24 +64,31 @@ pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) {
 /// `XFEATURE_ENABLED_MASK` for `XCR`
 ///
 /// This intrinsic maps to `XSETBV` instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub const _XCR_XFEATURE_ENABLED_MASK: u32 = 0;

 /// Copy 64-bits from `val` to the extended control register (`XCR`) specified
 /// by `a`.
 ///
 /// Currently only `XFEATURE_ENABLED_MASK` `XCR` is supported.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsetbv)
 #[inline]
 #[target_feature(enable = "xsave")]
 #[cfg_attr(test, assert_instr(xsetbv))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsetbv(a: u32, val: u64) {
    xsetbv(a, (val >> 32) as u32, val as u32);
 }

 /// Reads the contents of the extended control register `XCR`
 /// specified in `xcr_no`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xgetbv)
 #[inline]
 #[target_feature(enable = "xsave")]
 #[cfg_attr(test, assert_instr(xgetbv))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xgetbv(xcr_no: u32) -> u64 {
    let eax: u32;
    let edx: u32;
@ -90,9 +103,12 @@ pub unsafe fn _xgetbv(xcr_no: u32) -> u64 {
 /// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize
 /// the manner in which data is saved. The performance of this instruction will
 /// be equal to or better than using the `XSAVE` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaveopt)
 #[inline]
 #[target_feature(enable = "xsave,xsaveopt")]
 #[cfg_attr(test, assert_instr(xsaveopt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) {
    xsaveopt(
        mem_addr,
@ -107,9 +123,12 @@ pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) {
 /// `xsavec` differs from `xsave` in that it uses compaction and that it may
 /// use init optimization. State is saved based on bits [62:0] in `save_mask`
 /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsavec)
 #[inline]
 #[target_feature(enable = "xsave,xsavec")]
 #[cfg_attr(test, assert_instr(xsavec))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) {
    xsavec(
        mem_addr,
@ -125,9 +144,12 @@ pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) {
 /// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the
 /// modified optimization. State is saved based on bits [62:0] in `save_mask`
 /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaves)
 #[inline]
 #[target_feature(enable = "xsave,xsaves")]
 #[cfg_attr(test, assert_instr(xsaves))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) {
    xsaves(
        mem_addr,
@ -145,9 +167,12 @@ pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) {
 /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and
 /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
 /// boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstors)
 #[inline]
 #[target_feature(enable = "xsave,xsaves")]
 #[cfg_attr(test, assert_instr(xrstors))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) {
    xrstors(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
 }
--- a/library/stdarch/coresimd/x86_64/abm.rs
+++ b/library/stdarch/coresimd/x86_64/abm.rs
@ -23,17 +23,23 @@ use stdsimd_test::assert_instr;
 /// Counts the leading most significant zero bits.
 ///
 /// When the operand is zero, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_lzcnt_u64)
 #[inline]
 #[target_feature(enable = "lzcnt")]
 #[cfg_attr(test, assert_instr(lzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _lzcnt_u64(x: u64) -> u64 {
    x.leading_zeros() as u64
 }

 /// Counts the bits that are set.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_popcnt64)
 #[inline]
 #[target_feature(enable = "popcnt")]
 #[cfg_attr(test, assert_instr(popcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _popcnt64(x: i64) -> i32 {
    x.count_ones() as i32
 }
--- a/library/stdarch/coresimd/x86_64/avx.rs
+++ b/library/stdarch/coresimd/x86_64/avx.rs
@ -19,10 +19,13 @@ use mem;

 /// Copy `a` to result, and insert the 64-bit integer `i` into result
 /// at the location specified by `index`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi64)
 #[inline]
 #[rustc_args_required_const(2)]
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_insert_epi64(a: __m256i, i: i64, index: i32) -> __m256i {
    mem::transmute(simd_insert(a.as_i64x4(), (index as u32) & 3, i))
 }
--- a/library/stdarch/coresimd/x86_64/avx2.rs
+++ b/library/stdarch/coresimd/x86_64/avx2.rs
@ -22,10 +22,13 @@ use coresimd::simd_llvm::*;
 use coresimd::x86::*;

 /// Extract a 64-bit integer from `a`, selected with `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi64)
 #[inline]
 #[target_feature(enable = "avx2")]
 #[rustc_args_required_const(1)]
 // This intrinsic has no corresponding instruction.
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_extract_epi64(a: __m256i, imm8: i32) -> i64 {
    let imm8 = (imm8 & 3) as u32;
    simd_extract(a.as_i64x4(), imm8)
--- a/library/stdarch/coresimd/x86_64/bmi.rs
+++ b/library/stdarch/coresimd/x86_64/bmi.rs
@ -14,10 +14,13 @@ use stdsimd_test::assert_instr;

 /// Extracts bits in range [`start`, `start` + `length`) from `a` into
 /// the least significant bits of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr_u64)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(bextr))]
 #[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 {
    _bextr2_u64(a, ((start & 0xff) | ((len & 0xff) << 8)) as u64)
 }
@ -27,36 +30,48 @@ pub unsafe fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 {
 ///
 /// Bits [7,0] of `control` specify the index to the first bit in the range to
 /// be extracted, and bits [15,8] specify the length of the range.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u64)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(bextr))]
 #[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _bextr2_u64(a: u64, control: u64) -> u64 {
    x86_bmi_bextr_64(a, control)
 }

 /// Bitwise logical `AND` of inverted `a` with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_andn_u64)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(andn))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 {
    !a & b
 }

 /// Extract lowest set isolated bit.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsi_u64)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(blsi))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blsi_u64(x: u64) -> u64 {
    x & x.wrapping_neg()
 }

 /// Get mask up to lowest set bit.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsmsk_u64)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(blsmsk))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blsmsk_u64(x: u64) -> u64 {
    x ^ (x.wrapping_sub(1_u64))
 }
@ -64,10 +79,13 @@ pub unsafe fn _blsmsk_u64(x: u64) -> u64 {
 /// Resets the lowest set bit of `x`.
 ///
 /// If `x` is sets CF.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsr_u64)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(blsr))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _blsr_u64(x: u64) -> u64 {
    x & (x.wrapping_sub(1))
 }
@ -75,9 +93,12 @@ pub unsafe fn _blsr_u64(x: u64) -> u64 {
 /// Counts the number of trailing least significant zero bits.
 ///
 /// When the source operand is 0, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_tzcnt_u64)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _tzcnt_u64(x: u64) -> u64 {
    x.trailing_zeros() as u64
 }
@ -85,9 +106,12 @@ pub unsafe fn _tzcnt_u64(x: u64) -> u64 {
 /// Counts the number of trailing least significant zero bits.
 ///
 /// When the source operand is 0, it returns its size in bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_tzcnt_64)
 #[inline]
 #[target_feature(enable = "bmi1")]
 #[cfg_attr(test, assert_instr(tzcnt))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_tzcnt_64(x: u64) -> i64 {
    x.trailing_zeros() as i64
 }
--- a/library/stdarch/coresimd/x86_64/bmi2.rs
+++ b/library/stdarch/coresimd/x86_64/bmi2.rs
@ -17,10 +17,13 @@ use stdsimd_test::assert_instr;
 ///
 /// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with
 /// the low half and the high half of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mulx_u64)
 #[inline]
 #[cfg_attr(test, assert_instr(mulx))]
 #[target_feature(enable = "bmi2")]
 #[cfg(not(target_arch = "x86"))] // calls an intrinsic
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mulx_u64(a: u64, b: u64, hi: &mut u64) -> u64 {
    let result: u128 = (a as u128) * (b as u128);
    *hi = (result >> 64) as u64;
@ -28,30 +31,39 @@ pub unsafe fn _mulx_u64(a: u64, b: u64, hi: &mut u64) -> u64 {
 }

 /// Zero higher bits of `a` >= `index`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bzhi_u64)
 #[inline]
 #[target_feature(enable = "bmi2")]
 #[cfg_attr(test, assert_instr(bzhi))]
 #[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _bzhi_u64(a: u64, index: u32) -> u64 {
    x86_bmi2_bzhi_64(a, index as u64)
 }

 /// Scatter contiguous low order bits of `a` to the result at the positions
 /// specified by the `mask`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pdep_u64)
 #[inline]
 #[target_feature(enable = "bmi2")]
 #[cfg_attr(test, assert_instr(pdep))]
 #[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _pdep_u64(a: u64, mask: u64) -> u64 {
    x86_bmi2_pdep_64(a, mask)
 }

 /// Gathers the bits of `x` specified by the `mask` into the contiguous low
 /// order bit positions of the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pext_u64)
 #[inline]
 #[target_feature(enable = "bmi2")]
 #[cfg_attr(test, assert_instr(pext))]
 #[cfg(not(target_arch = "x86"))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _pext_u64(a: u64, mask: u64) -> u64 {
    x86_bmi2_pext_64(a, mask)
 }
--- a/library/stdarch/coresimd/x86_64/bswap.rs
+++ b/library/stdarch/coresimd/x86_64/bswap.rs
@ -6,8 +6,11 @@
 use stdsimd_test::assert_instr;

 /// Return an integer with the reversed byte order of x
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bswap64)
 #[inline]
 #[cfg_attr(test, assert_instr(bswap))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _bswap64(x: i64) -> i64 {
    bswap_i64(x)
 }
--- a/library/stdarch/coresimd/x86_64/fxsr.rs
+++ b/library/stdarch/coresimd/x86_64/fxsr.rs
@ -21,9 +21,12 @@ extern "C" {
 ///
 /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html
 /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxsave64)
 #[inline]
 #[target_feature(enable = "fxsr")]
 #[cfg_attr(test, assert_instr(fxsave64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _fxsave64(mem_addr: *mut u8) {
    fxsave64(mem_addr)
 }
@ -42,9 +45,12 @@ pub unsafe fn _fxsave64(mem_addr: *mut u8) {
 ///
 /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html
 /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxrstor64)
 #[inline]
 #[target_feature(enable = "fxsr")]
 #[cfg_attr(test, assert_instr(fxrstor64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _fxrstor64(mem_addr: *const u8) {
    fxrstor64(mem_addr)
 }
--- a/library/stdarch/coresimd/x86_64/rdrand.rs
+++ b/library/stdarch/coresimd/x86_64/rdrand.rs
@ -12,10 +12,13 @@ use stdsimd_test::assert_instr;

 /// Read a hardware generated 64-bit random value and store the result in val.
 /// Return 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdrand64_step)
 #[inline]
 #[target_feature(enable = "rdrand")]
 #[cfg_attr(test, assert_instr(rdrand))]
 #[cfg_attr(feature = "cargo-clippy", allow(stutter))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _rdrand64_step(val: &mut u64) -> i32 {
    let (v, flag) = x86_rdrand64_step();
    *val = v;
@ -24,9 +27,12 @@ pub unsafe fn _rdrand64_step(val: &mut u64) -> i32 {

 /// Read a 64-bit NIST SP800-90B and SP800-90C compliant random value and store
 /// in val. Return 1 if a random value was generated, and 0 otherwise.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdseed64_step)
 #[inline]
 #[target_feature(enable = "rdseed")]
 #[cfg_attr(test, assert_instr(rdseed))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _rdseed64_step(val: &mut u64) -> i32 {
    let (v, flag) = x86_rdseed64_step();
    *val = v;
--- a/library/stdarch/coresimd/x86_64/sse.rs
+++ b/library/stdarch/coresimd/x86_64/sse.rs
@ -24,9 +24,12 @@ extern "C" {
 /// [`_mm_setcsr`](fn._mm_setcsr.html)).
 ///
 /// This corresponds to the `CVTSS2SI` instruction (with 64 bit output).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64)
 #[inline]
 #[target_feature(enable = "sse")]
 #[cfg_attr(test, assert_instr(cvtss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtss_si64(a: __m128) -> i64 {
    cvtss2si64(a)
 }
@ -40,9 +43,12 @@ pub unsafe fn _mm_cvtss_si64(a: __m128) -> i64 {
 /// point exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
 ///
 /// This corresponds to the `CVTTSS2SI` instruction (with 64 bit output).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64)
 #[inline]
 #[target_feature(enable = "sse")]
 #[cfg_attr(test, assert_instr(cvttss2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvttss_si64(a: __m128) -> i64 {
    cvttss2si64(a)
 }
@ -52,9 +58,12 @@ pub unsafe fn _mm_cvttss_si64(a: __m128) -> i64 {
 ///
 /// This intrinsic corresponds to the `CVTSI2SS` instruction (with 64 bit
 /// input).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss)
 #[inline]
 #[target_feature(enable = "sse")]
 #[cfg_attr(test, assert_instr(cvtsi2ss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtsi64_ss(a: __m128, b: i64) -> __m128 {
    cvtsi642ss(a, b)
 }
--- a/library/stdarch/coresimd/x86_64/sse2.rs
+++ b/library/stdarch/coresimd/x86_64/sse2.rs
@ -17,34 +17,46 @@ extern "C" {

 /// Convert the lower double-precision (64-bit) floating-point element in a to
 /// a 64-bit integer.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64)
 #[inline]
 #[target_feature(enable = "sse2")]
 #[cfg_attr(test, assert_instr(cvtsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtsd_si64(a: __m128d) -> i64 {
    cvtsd2si64(a)
 }

 /// Alias for `_mm_cvtsd_si64`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x)
 #[inline]
 #[target_feature(enable = "sse2")]
 #[cfg_attr(test, assert_instr(cvtsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtsd_si64x(a: __m128d) -> i64 {
    _mm_cvtsd_si64(a)
 }

 /// Convert the lower double-precision (64-bit) floating-point element in `a`
 /// to a 64-bit integer with truncation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64)
 #[inline]
 #[target_feature(enable = "sse2")]
 #[cfg_attr(test, assert_instr(cvttsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvttsd_si64(a: __m128d) -> i64 {
    cvttsd2si64(a)
 }

 /// Alias for `_mm_cvttsd_si64`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x)
 #[inline]
 #[target_feature(enable = "sse2")]
 #[cfg_attr(test, assert_instr(cvttsd2si))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
    _mm_cvttsd_si64(a)
 }
@ -52,61 +64,82 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
 /// Stores a 64-bit integer value in the specified memory location.
 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
 /// used again soon).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64)
 #[inline]
 #[target_feature(enable = "sse2")]
 #[cfg_attr(test, assert_instr(movnti))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
    intrinsics::nontemporal_store(mem_addr, a);
 }

 /// Return a vector whose lowest element is `a` and all higher elements are
 /// `0`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_si128)
 #[inline]
 #[target_feature(enable = "sse2")]
 #[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtsi64_si128(a: i64) -> __m128i {
    _mm_set_epi64x(0, a)
 }

 /// Return a vector whose lowest element is `a` and all higher elements are
 /// `0`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128)
 #[inline]
 #[target_feature(enable = "sse2")]
 #[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> __m128i {
    _mm_cvtsi64_si128(a)
 }

 /// Return the lowest element of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64)
 #[inline]
 #[target_feature(enable = "sse2")]
 #[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtsi128_si64(a: __m128i) -> i64 {
    simd_extract(a.as_i64x2(), 0)
 }

 /// Return the lowest element of `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x)
 #[inline]
 #[target_feature(enable = "sse2")]
 #[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtsi128_si64x(a: __m128i) -> i64 {
    _mm_cvtsi128_si64(a)
 }

 /// Return `a` with its lower element replaced by `b` after converting it to
 /// an `f64`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd)
 #[inline]
 #[target_feature(enable = "sse2")]
 #[cfg_attr(test, assert_instr(cvtsi2sd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtsi64_sd(a: __m128d, b: i64) -> __m128d {
    simd_insert(a, 0, b as f64)
 }

 /// Return `a` with its lower element replaced by `b` after converting it to
 /// an `f64`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd)
 #[inline]
 #[target_feature(enable = "sse2")]
 #[cfg_attr(test, assert_instr(cvtsi2sd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtsi64x_sd(a: __m128d, b: i64) -> __m128d {
    _mm_cvtsi64_sd(a, b)
 }
--- a/library/stdarch/coresimd/x86_64/sse41.rs
+++ b/library/stdarch/coresimd/x86_64/sse41.rs
@ -8,11 +8,14 @@ use mem;
 use stdsimd_test::assert_instr;

 /// Extract an 64-bit integer from `a` selected with `imm8`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi64)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 // TODO: Add test for Windows
 #[cfg_attr(test, assert_instr(pextrq, imm8 = 1))]
 #[rustc_args_required_const(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_extract_epi64(a: __m128i, imm8: i32) -> i64 {
    let imm8 = (imm8 & 1) as u32;
    simd_extract(a.as_i64x2(), imm8)
@ -20,10 +23,13 @@ pub unsafe fn _mm_extract_epi64(a: __m128i, imm8: i32) -> i64 {

 /// Return a copy of `a` with the 64-bit integer from `i` inserted at a
 /// location specified by `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi64)
 #[inline]
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(pinsrq, imm8 = 0))]
 #[rustc_args_required_const(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_insert_epi64(a: __m128i, i: i64, imm8: i32) -> __m128i {
    mem::transmute(simd_insert(a.as_i64x2(), (imm8 & 1) as u32, i))
 }
--- a/library/stdarch/coresimd/x86_64/sse42.rs
+++ b/library/stdarch/coresimd/x86_64/sse42.rs
@ -11,9 +11,12 @@ extern "C" {

 /// Starting with the initial value in `crc`, return the accumulated
 /// CRC32 value for unsigned 64-bit integer `v`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u64)
 #[inline]
 #[target_feature(enable = "sse4.2")]
 #[cfg_attr(test, assert_instr(crc32))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_crc32_u64(crc: u64, v: u64) -> u64 {
    crc32_64_64(crc, v)
 }
--- a/library/stdarch/coresimd/x86_64/xsave.rs
+++ b/library/stdarch/coresimd/x86_64/xsave.rs
@ -29,9 +29,12 @@ extern "C" {
 ///
 /// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of
 /// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsave64)
 #[inline]
 #[target_feature(enable = "xsave")]
 #[cfg_attr(test, assert_instr(xsave64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) {
    xsave64(
        mem_addr,
@ -46,9 +49,12 @@ pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) {
 /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and
 /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
 /// boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstor64)
 #[inline]
 #[target_feature(enable = "xsave")]
 #[cfg_attr(test, assert_instr(xrstor64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) {
    xrstor64(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
 }
@ -60,9 +66,12 @@ pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) {
 /// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize
 /// the manner in which data is saved. The performance of this instruction will
 /// be equal to or better than using the `XSAVE64` instruction.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaveopt64)
 #[inline]
 #[target_feature(enable = "xsave,xsaveopt")]
 #[cfg_attr(test, assert_instr(xsaveopt64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) {
    xsaveopt64(
        mem_addr,
@ -77,9 +86,12 @@ pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) {
 /// `xsavec` differs from `xsave` in that it uses compaction and that it may
 /// use init optimization. State is saved based on bits [62:0] in `save_mask`
 /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsavec64)
 #[inline]
 #[target_feature(enable = "xsave,xsavec")]
 #[cfg_attr(test, assert_instr(xsavec64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) {
    xsavec64(
        mem_addr,
@ -95,9 +107,12 @@ pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) {
 /// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the
 /// modified optimization. State is saved based on bits [62:0] in `save_mask`
 /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaves64)
 #[inline]
 #[target_feature(enable = "xsave,xsaves")]
 #[cfg_attr(test, assert_instr(xsaves64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) {
    xsaves64(
        mem_addr,
@ -115,9 +130,12 @@ pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) {
 /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and
 /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte
 /// boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstors64)
 #[inline]
 #[target_feature(enable = "xsave,xsaves")]
 #[cfg_attr(test, assert_instr(xrstors64))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _xrstors64(mem_addr: *const u8, rs_mask: u64) {
    xrstors64(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
 }
--- a/library/stdarch/crates/stdsimd/src/lib.rs
+++ b/library/stdarch/crates/stdsimd/src/lib.rs
@ -8,7 +8,7 @@
 //! [stdsimd]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/

 #![feature(const_fn, integer_atomics, staged_api, stdsimd)]
-#![feature(cfg_target_feature, doc_cfg)]
+#![feature(cfg_target_feature, doc_cfg, allow_internal_unstable)]
 #![cfg_attr(feature = "cargo-clippy", allow(shadow_reuse))]
 #![cfg_attr(target_os = "linux", feature(linkage))]
 #![no_std]
--- a/library/stdarch/stdsimd/arch/detect/arch/x86.rs
+++ b/library/stdarch/stdsimd/arch/detect/arch/x86.rs
@ -16,8 +16,70 @@
 //! in a global `AtomicUsize` variable. The query is performed by just checking
 //! whether the feature bit in this global variable is set or cleared.

+/// A macro to test at *runtime* whether a CPU feature is available on
+/// x86/x86-64 platforms.
+///
+/// This macro is provided in the standard library and will detect at runtime
+/// whether the specified CPU feature is detected. This does *not* resolve at
+/// compile time unless the specified feature is already enabled for the entire
+/// crate. Runtime detection currently relies mostly on the `cpuid` instruction.
+///
+/// This macro only takes one argument which is a string literal of the feature
+/// being tested for. The feature names supported are the lowercase versions of
+/// the ones defined by Intel in [their documentation][docs].
+///
+/// ## Supported arguments
+///
+/// This macro supports the same names that `#[target_feature]` supports. Unlike
+/// `#[target_feature]`, however, this macro does not support names separated
+/// with a comma. Instead testing for multiple features must be done through
+/// separate macro invocations for now.
+///
+/// Supported arguments are:
+///
+/// * `"aes"`
+/// * `"pclmulqdq"`
+/// * `"rdrand"`
+/// * `"rdseed"`
+/// * `"tsc"`
+/// * `"mmx"`
+/// * `"sse"`
+/// * `"sse2"`
+/// * `"sse3"`
+/// * `"ssse3"`
+/// * `"sse4.1"`
+/// * `"sse4.2"`
+/// * `"sse4a"`
+/// * `"sha"`
+/// * `"avx"`
+/// * `"avx2"`
+/// * `"avx512f"`
+/// * `"avx512cd"`
+/// * `"avx512er"`
+/// * `"avx512pf"`
+/// * `"avx512bw"`
+/// * `"avx512dq"`
+/// * `"avx512vl"`
+/// * `"avx512ifma"`
+/// * `"avx512vbmi"`
+/// * `"avx512vpopcntdq"`
+/// * `"fma"`
+/// * `"bmi1"`
+/// * `"bmi2"`
+/// * `"abm"`
+/// * `"lzcnt"`
+/// * `"tbm"`
+/// * `"popcnt"`
+/// * `"fxsr"`
+/// * `"xsave"`
+/// * `"xsaveopt"`
+/// * `"xsaves"`
+/// * `"xsavec"`
+///
+/// [docs]: https://software.intel.com/sites/landingpage/IntrinsicsGuide
 #[macro_export]
-#[unstable(feature = "stdsimd", issue = "0")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+#[allow_internal_unstable]
 macro_rules! is_x86_feature_detected {
    ("aes") => {
        cfg!(target_feature = "aes") || $crate::arch::detect::check_for(
--- a/library/stdarch/stdsimd/mod.rs
+++ b/library/stdarch/stdsimd/mod.rs
@ -343,30 +343,38 @@
 ///     }
 /// }
 /// ```
-#[unstable(feature = "stdsimd", issue = "0")]
+#[stable(feature = "simd_arch", since = "1.27.0")]
 pub mod arch {
    #[cfg(all(not(dox), target_arch = "x86"))]
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub use coresimd::arch::x86;

    #[cfg(all(not(dox), target_arch = "x86_64"))]
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub use coresimd::arch::x86_64;

    #[cfg(all(not(dox), target_arch = "arm"))]
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub use coresimd::arch::arm;

    #[cfg(all(not(dox), target_arch = "aarch64"))]
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub use coresimd::arch::aarch64;

    #[cfg(target_arch = "wasm32")]
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub use coresimd::arch::wasm32;

    #[cfg(all(not(dox), target_arch = "mips"))]
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub use coresimd::arch::mips;

    #[cfg(all(not(dox), target_arch = "mips64"))]
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub use coresimd::arch::mips64;

    #[doc(hidden)] // unstable implementation detail
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub mod detect;

    /// Platform-specific intrinsics for the `x86` platform.
@ -378,6 +386,7 @@ pub mod arch {
    /// [libcore]: ../../../core/arch/x86/index.html
    #[cfg(dox)]
    #[doc(cfg(target_arch = "x86"))]
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub mod x86 {}

    /// Platform-specific intrinsics for the `x86_64` platform.
@ -389,6 +398,7 @@ pub mod arch {
    /// [libcore]: ../../../core/arch/x86_64/index.html
    #[cfg(dox)]
    #[doc(cfg(target_arch = "x86_64"))]
+    #[stable(feature = "simd_x86", since = "1.27.0")]
    pub mod x86_64 {}

    /// Platform-specific intrinsics for the `arm` platform.
@ -400,6 +410,7 @@ pub mod arch {
    /// [libcore]: ../../../core/arch/arm/index.html
    #[cfg(dox)]
    #[doc(cfg(target_arch = "arm"))]
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub mod arm {}

    /// Platform-specific intrinsics for the `aarch64` platform.
@ -411,6 +422,7 @@ pub mod arch {
    /// [libcore]: ../../../core/arch/aarch64/index.html
    #[cfg(dox)]
    #[doc(cfg(target_arch = "aarch64"))]
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub mod aarch64 {}

    /// Platform-specific intrinsics for the `mips` platform.
@ -422,6 +434,7 @@ pub mod arch {
    /// [libcore]: ../../../core/arch/mips/index.html
    #[cfg(dox)]
    #[doc(cfg(target_arch = "mips"))]
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub mod mips {}

    /// Platform-specific intrinsics for the `mips64` platform.
@ -433,6 +446,7 @@ pub mod arch {
    /// [libcore]: ../../../core/arch/mips64/index.html
    #[cfg(dox)]
    #[doc(cfg(target_arch = "mips64"))]
+    #[unstable(feature = "stdsimd", issue = "0")]
    pub mod mips64 {}
 }