Auto merge of #148425 - matthiaskrgr:rollup-pwibmxt, r=matthiaskrgr

Rollup of 3 pull requests

Successful merges:

 - rust-lang/rust#146260 (add SliceIndex wrapper types Last and Clamp<Idx>)
 - rust-lang/rust#148394 (Make explicit that `TypeId`'s layout and size are unstable)
 - rust-lang/rust#148402 (stdarch subtree update)

r? `@ghost`
`@rustbot` modify labels: rollup
bors 2025-11-03 06:36:43 +00:00
commit f2bae990e8
82 changed files with 13750 additions and 1086 deletions


@ -611,6 +611,15 @@ impl dyn Any + Send + Sync {
/// noting that the hashes and ordering will vary between Rust releases. Beware
/// of relying on them inside of your code!
///
/// # Layout
///
/// Like other [`Rust`-representation][repr-rust] types, `TypeId`'s size and layout are unstable.
/// In particular, you cannot rely on them staying the same from one Rust release to the next;
/// they may change without prior notice.
///
/// [repr-rust]: https://doc.rust-lang.org/reference/type-layout.html#r-layout.repr.rust.unspecified
///
/// # Danger of Improper Variance
///
/// You might think that subtyping is impossible between two static types,

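The layout note above boils down to: measure, don't assume. A minimal sketch (the printed value is whatever the current toolchain happens to use, nothing more):

```rust
use std::any::TypeId;
use std::mem::size_of;

fn main() {
    // Query the size at runtime instead of hardcoding it; nothing
    // guarantees this value stays the same across Rust releases.
    println!("size_of::<TypeId>() = {}", size_of::<TypeId>());
    // Comparing `TypeId`s remains fine; relying on their layout is not.
    assert_eq!(TypeId::of::<u64>(), TypeId::of::<u64>());
}
```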

@ -271,11 +271,11 @@ pub fn spin_loop() {
crate::cfg_select! {
target_arch = "x86" => {
// SAFETY: the `cfg` attr ensures that we only execute this on x86 targets.
unsafe { crate::arch::x86::_mm_pause() }
crate::arch::x86::_mm_pause()
}
target_arch = "x86_64" => {
// SAFETY: the `cfg` attr ensures that we only execute this on x86_64 targets.
unsafe { crate::arch::x86_64::_mm_pause() }
crate::arch::x86_64::_mm_pause()
}
target_arch = "riscv32" => crate::arch::riscv32::pause(),
target_arch = "riscv64" => crate::arch::riscv64::pause(),
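For callers nothing changes; the x86 pause intrinsic is simply safe to call now. A typical spin-wait loop over the hint (a usage sketch, not part of this diff):

```rust
use std::hint::spin_loop;
use std::sync::atomic::{AtomicBool, Ordering};

fn wait_for(flag: &AtomicBool) {
    // On x86/x86_64 this lowers to PAUSE; `_mm_pause` no longer needs
    // an `unsafe` block inside `spin_loop`.
    while !flag.load(Ordering::Acquire) {
        spin_loop();
    }
}
```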

library/core/src/index.rs (new file, 472 lines)

@ -0,0 +1,472 @@
#![unstable(feature = "sliceindex_wrappers", issue = "146179")]
//! Helper types for indexing slices.
use crate::intrinsics::slice_get_unchecked;
use crate::slice::SliceIndex;
use crate::{cmp, ops, range};
/// Clamps an index, guaranteeing that it will only access valid elements of the slice.
///
/// # Examples
///
/// ```
/// #![feature(sliceindex_wrappers)]
///
/// use core::index::Clamp;
///
/// let s: &[usize] = &[0, 1, 2, 3];
///
/// assert_eq!(&3, &s[Clamp(6)]);
/// assert_eq!(&[1, 2, 3], &s[Clamp(1..6)]);
/// assert_eq!(&[] as &[usize], &s[Clamp(5..6)]);
/// assert_eq!(&[0, 1, 2, 3], &s[Clamp(..6)]);
/// assert_eq!(&[0, 1, 2, 3], &s[Clamp(..=6)]);
/// assert_eq!(&[] as &[usize], &s[Clamp(6..)]);
/// ```
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
#[derive(Debug)]
pub struct Clamp<Idx>(pub Idx);
/// Always accesses the last element of the slice.
///
/// # Examples
///
/// ```
/// #![feature(sliceindex_wrappers)]
/// #![feature(slice_index_methods)]
///
/// use core::index::Last;
/// use core::slice::SliceIndex;
///
/// let s = &[0, 1, 2, 3];
///
/// assert_eq!(&3, &s[Last]);
/// assert_eq!(None, Last.get(&[] as &[usize]));
///
/// ```
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
#[derive(Debug)]
pub struct Last;
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<usize> {
type Output = T;
fn get(self, slice: &[T]) -> Option<&Self::Output> {
slice.get(cmp::min(self.0, slice.len() - 1))
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
slice.get_mut(cmp::min(self.0, slice.len() - 1))
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { slice_get_unchecked(slice, cmp::min(self.0, slice.len() - 1)) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { slice_get_unchecked(slice, cmp::min(self.0, slice.len() - 1)) }
}
fn index(self, slice: &[T]) -> &Self::Output {
&(*slice)[cmp::min(self.0, slice.len() - 1)]
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
&mut (*slice)[cmp::min(self.0, slice.len() - 1)]
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<range::Range<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
// SAFETY: a range ending before len is always valid
unsafe { (start..end).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
// SAFETY: a range ending before len is always valid
unsafe { (start..end).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<ops::Range<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
// SAFETY: a range ending before len is always valid
unsafe { (start..end).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
// SAFETY: a range ending before len is always valid
unsafe { (start..end).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<range::RangeInclusive<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.last, slice.len() - 1);
(start..=end).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.last, slice.len() - 1);
(start..=end).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.last, slice.len() - 1);
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (start..=end).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.last, slice.len() - 1);
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (start..=end).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.last, slice.len() - 1);
(start..=end).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.last, slice.len() - 1);
(start..=end).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<ops::RangeInclusive<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.end, slice.len() - 1);
(start..=end).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.end, slice.len() - 1);
(start..=end).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.end, slice.len() - 1);
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (start..=end).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.end, slice.len() - 1);
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (start..=end).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.end, slice.len() - 1);
(start..=end).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.end, slice.len() - 1);
(start..=end).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<range::RangeFrom<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
(cmp::min(self.0.start, slice.len())..).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
(cmp::min(self.0.start, slice.len())..).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: a range starting at len is valid
unsafe { (cmp::min(self.0.start, slice.len())..).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: a range starting at len is valid
unsafe { (cmp::min(self.0.start, slice.len())..).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
(cmp::min(self.0.start, slice.len())..).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
(cmp::min(self.0.start, slice.len())..).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<ops::RangeFrom<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
(cmp::min(self.0.start, slice.len())..).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
(cmp::min(self.0.start, slice.len())..).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: a range starting at len is valid
unsafe { (cmp::min(self.0.start, slice.len())..).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: a range starting at len is valid
unsafe { (cmp::min(self.0.start, slice.len())..).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
(cmp::min(self.0.start, slice.len())..).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
(cmp::min(self.0.start, slice.len())..).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<range::RangeTo<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
(..cmp::min(self.0.end, slice.len())).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
(..cmp::min(self.0.end, slice.len())).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: a range ending before len is always valid
unsafe { (..cmp::min(self.0.end, slice.len())).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: a range ending before len is always valid
unsafe { (..cmp::min(self.0.end, slice.len())).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
(..cmp::min(self.0.end, slice.len())).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
(..cmp::min(self.0.end, slice.len())).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<range::RangeToInclusive<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
(..=cmp::min(self.0.last, slice.len() - 1)).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
(..=cmp::min(self.0.last, slice.len() - 1)).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (..=cmp::min(self.0.last, slice.len() - 1)).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (..=cmp::min(self.0.last, slice.len() - 1)).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
(..=cmp::min(self.0.last, slice.len() - 1)).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
(..=cmp::min(self.0.last, slice.len() - 1)).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<ops::RangeToInclusive<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
(..=cmp::min(self.0.end, slice.len() - 1)).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
(..=cmp::min(self.0.end, slice.len() - 1)).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (..=cmp::min(self.0.end, slice.len() - 1)).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (..=cmp::min(self.0.end, slice.len() - 1)).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
(..=cmp::min(self.0.end, slice.len() - 1)).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
(..=cmp::min(self.0.end, slice.len() - 1)).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<range::RangeFull> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
(..).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
(..).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: RangeFull just returns `slice` here
unsafe { (..).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: RangeFull just returns `slice` here
unsafe { (..).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
(..).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
(..).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Last {
type Output = T;
fn get(self, slice: &[T]) -> Option<&Self::Output> {
slice.last()
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
slice.last_mut()
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { slice_get_unchecked(slice, slice.len() - 1) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { slice_get_unchecked(slice, slice.len() - 1) }
}
fn index(self, slice: &[T]) -> &Self::Output {
// N.B., use intrinsic indexing
&(*slice)[slice.len() - 1]
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
// N.B., use intrinsic indexing
&mut (*slice)[slice.len() - 1]
}
}


@ -294,6 +294,7 @@ pub mod cmp;
pub mod convert;
pub mod default;
pub mod error;
pub mod index;
pub mod marker;
pub mod ops;


@ -650,6 +650,18 @@ impl<Idx: PartialOrd<Idx>> RangeToInclusive<Idx> {
}
}
impl<T> From<legacy::RangeToInclusive<T>> for RangeToInclusive<T> {
fn from(value: legacy::RangeToInclusive<T>) -> Self {
Self { last: value.end }
}
}
impl<T> From<RangeToInclusive<T>> for legacy::RangeToInclusive<T> {
fn from(value: RangeToInclusive<T>) -> Self {
Self { end: value.last }
}
}
// RangeToInclusive<Idx> cannot impl From<RangeTo<Idx>>
// because underflow would be possible with (..0).into()
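A quick sketch of the two conversions (assuming the unstable `new_range_api` feature and, as the impls above indicate, a public `last` field):

```rust
#![feature(new_range_api)]
use core::range;

fn main() {
    // `end` on the legacy type maps to `last` on the new one, and back.
    let new: range::RangeToInclusive<u32> = (..=5).into();
    assert_eq!(new.last, 5);
    let legacy: core::ops::RangeToInclusive<u32> = new.into();
    assert_eq!(legacy.end, 5);
}
```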


@ -135,6 +135,11 @@ mod private_slice_index {
impl Sealed for range::RangeFrom<usize> {}
impl Sealed for ops::IndexRange {}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
impl Sealed for crate::index::Last {}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
impl<T> Sealed for crate::index::Clamp<T> where T: Sealed {}
}
/// A helper trait used for indexing operations.
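The `Sealed` supertrait above is the usual sealed-trait pattern; in miniature (a generic sketch, not the actual std code):

```rust
mod private {
    // Not nameable outside this module, so no external impls are possible.
    pub trait Sealed {}
    impl Sealed for usize {}
}

// Downstream code can use `MyIndex` in bounds but cannot implement it.
pub trait MyIndex: private::Sealed {}
impl MyIndex for usize {}
```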


@ -0,0 +1,83 @@
use core::index::Clamp;
use core::range;
use core::slice::SliceIndex;
macro_rules! test_clamp {
($range:expr, $(($slice:expr, $other:expr)),+) => {
$(
assert_eq!(Clamp($range.clone()).get(&$slice as &[_]), $other.get(&$slice as &[_]));
assert_eq!(Clamp($range.clone()).get_mut(&mut $slice as &mut [_]), $other.get_mut(&mut $slice as &mut [_]));
unsafe {
assert_eq!(&*Clamp($range.clone()).get_unchecked(&$slice as &[_]), &*$other.get_unchecked(&$slice as &[_]));
assert_eq!(&*Clamp($range.clone()).get_unchecked_mut(&mut $slice as &mut [_]), &*$other.get_unchecked_mut(&mut $slice as &mut [_]));
}
assert_eq!(Clamp($range.clone()).index(&$slice as &[_]), $other.index(&$slice as &[_]));
assert_eq!(Clamp($range.clone()).index_mut(&mut $slice as &mut [_]), $other.index_mut(&mut $slice as &mut [_]));
)+
};
}
#[test]
fn test_clamp_usize() {
test_clamp!(2, ([0, 1], 1), ([0, 1, 2], 2));
}
#[test]
fn test_clamp_range_range() {
test_clamp!(range::Range::from(1..4), ([0, 1], 1..2), ([0, 1, 2, 3, 4], 1..4), ([0], 1..1));
}
#[test]
fn test_clamp_ops_range() {
test_clamp!(1..4, ([0, 1], 1..2), ([0, 1, 2, 3, 4], 1..4), ([0], 1..1));
}
#[test]
fn test_clamp_range_range_inclusive() {
test_clamp!(
range::RangeInclusive::from(1..=3),
([0, 1], 1..=1),
([0, 1, 2, 3, 4], 1..=3),
([0], 0..=0)
);
}
#[test]
fn test_clamp_ops_range_inclusive() {
test_clamp!(1..=3, ([0, 1], 1..=1), ([0, 1, 2, 3, 4], 1..=3), ([0], 0..=0));
}
#[test]
fn test_clamp_range_range_from() {
test_clamp!(range::RangeFrom::from(1..), ([0, 1], 1..), ([0, 1, 2, 3, 4], 1..), ([0], 1..));
}
#[test]
fn test_clamp_ops_range_from() {
test_clamp!(1.., ([0, 1], 1..), ([0, 1, 2, 3, 4], 1..), ([0], 1..));
}
#[test]
fn test_clamp_range_to() {
test_clamp!(..4, ([0, 1], ..2), ([0, 1, 2, 3, 4], ..4), ([0], ..1));
}
#[test]
fn test_clamp_range_range_to_inclusive() {
test_clamp!(
range::RangeToInclusive::from(..=4),
([0, 1], ..=1),
([0, 1, 2, 3, 4], ..=4),
([0], ..=0)
);
}
#[test]
fn test_clamp_ops_range_to_inclusive() {
test_clamp!(..=4, ([0, 1], ..=1), ([0, 1, 2, 3, 4], ..=4), ([0], ..=0));
}
#[test]
fn test_clamp_range_full() {
test_clamp!(.., ([0, 1], ..), ([0, 1, 2, 3, 4], ..), ([0], ..));
}


@ -85,6 +85,7 @@
#![feature(maybe_uninit_write_slice)]
#![feature(min_specialization)]
#![feature(never_type)]
#![feature(new_range_api)]
#![feature(next_index)]
#![feature(non_exhaustive_omitted_patterns_lint)]
#![feature(numfmt)]
@ -97,9 +98,11 @@
#![feature(ptr_metadata)]
#![feature(result_option_map_or_default)]
#![feature(slice_from_ptr_range)]
#![feature(slice_index_methods)]
#![feature(slice_internals)]
#![feature(slice_partition_dedup)]
#![feature(slice_split_once)]
#![feature(sliceindex_wrappers)]
#![feature(split_array)]
#![feature(split_as_slice)]
#![feature(std_internals)]
@ -178,6 +181,7 @@ mod fmt;
mod future;
mod hash;
mod hint;
mod index;
mod intrinsics;
mod io;
mod iter;


@ -249,6 +249,43 @@ jobs:
env:
TARGET: ${{ matrix.target.tuple }}
intrinsic-test:
needs: [style]
name: Intrinsic Test
runs-on: ubuntu-latest
strategy:
matrix:
target:
- aarch64-unknown-linux-gnu
- aarch64_be-unknown-linux-gnu
- armv7-unknown-linux-gnueabihf
- arm-unknown-linux-gnueabihf
- x86_64-unknown-linux-gnu
profile: [dev, release]
include:
- target: aarch64_be-unknown-linux-gnu
build_std: true
steps:
- uses: actions/checkout@v4
- name: Install Rust
run: |
rustup update nightly --no-self-update
rustup default nightly
- run: rustup target add ${{ matrix.target }}
if: ${{ (matrix.build_std || false) == false }}
- run: |
rustup component add rust-src
echo "CARGO_UNSTABLE_BUILD_STD=std" >> $GITHUB_ENV
if: ${{ matrix.build_std }}
# Configure some env vars based on matrix configuration
- run: echo "PROFILE=--profile=${{ matrix.profile }}" >> $GITHUB_ENV
- run: ./ci/intrinsic-test-docker.sh ${{ matrix.target }}
if: ${{ !startsWith(matrix.target, 'thumb') }}
env:
TARGET: ${{ matrix.target }}
# Check that the generated files agree with the checked-in versions.
check-stdarch-gen:
needs: [style]
@ -276,6 +313,7 @@ jobs:
- docs
- verify
- test
- intrinsic-test
- check-stdarch-gen
runs-on: ubuntu-latest
# We need to ensure this job does *not* get skipped if its dependencies fail,


@ -347,8 +347,11 @@ dependencies = [
"itertools",
"log",
"pretty_env_logger",
"quick-xml 0.37.5",
"rayon",
"regex",
"serde",
"serde-xml-rs",
"serde_json",
]
@ -404,9 +407,9 @@ checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432"
[[package]]
name = "memchr"
version = "2.7.5"
version = "2.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
[[package]]
name = "once_cell_polyfill"
@ -452,6 +455,16 @@ dependencies = [
"serde",
]
[[package]]
name = "quick-xml"
version = "0.37.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb"
dependencies = [
"memchr",
"serde",
]
[[package]]
name = "quickcheck"
version = "1.0.3"
@ -587,6 +600,18 @@ dependencies = [
"serde_derive",
]
[[package]]
name = "serde-xml-rs"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53630160a98edebde0123eb4dfd0fce6adff091b2305db3154a9e920206eb510"
dependencies = [
"log",
"serde",
"thiserror",
"xml-rs",
]
[[package]]
name = "serde_derive"
version = "1.0.219"
@ -698,7 +723,7 @@ name = "stdarch-verify"
version = "0.1.0"
dependencies = [
"proc-macro2",
"quick-xml",
"quick-xml 0.33.0",
"quote",
"serde",
"serde_json",
@ -746,6 +771,26 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "thiserror"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "unicode-ident"
version = "1.0.18"
@ -958,6 +1003,12 @@ version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486"
[[package]]
name = "xml-rs"
version = "0.8.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fd8403733700263c6eb89f192880191f1b83e332f7a20371ddcf421c4a337c7"
[[package]]
name = "yaml-rust"
version = "0.4.5"


@ -7,9 +7,9 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends \
xz-utils \
clang
ENV VERSION=v34.0.1
ENV VERSION=v38.0.3
RUN curl -L https://github.com/bytecodealliance/wasmtime/releases/download/${VERSION}/wasmtime-${VERSION}-x86_64-linux.tar.xz | tar xJf -
ENV PATH=$PATH:/wasmtime-${VERSION}-x86_64-linux
ENV CARGO_TARGET_WASM32_WASIP1_RUNNER="wasmtime --dir /checkout/target/wasm32-wasip1/release/deps::."
ENV CARGO_TARGET_WASM32_WASIP1_RUNNER="wasmtime -Wexceptions --dir /checkout/target/wasm32-wasip1/release/deps::."


@ -6,7 +6,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
make \
ca-certificates \
wget \
xz-utils
xz-utils \
clang \
libstdc++-14-dev \
build-essential \
lld
RUN wget http://ci-mirrors.rust-lang.org/stdarch/sde-external-9.58.0-2025-06-16-lin.tar.xz -O sde.tar.xz
RUN mkdir intel-sde
@ -14,5 +18,6 @@ RUN tar -xJf sde.tar.xz --strip-components=1 -C intel-sde
ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/intel-sde/sde64 \
-cpuid-in /checkout/ci/docker/x86_64-unknown-linux-gnu/cpuid.def \
-rtm-mode full -tsx --"
# These tests fail with SDE as it doesn't support saving register data
ENV STDARCH_TEST_SKIP_FUNCTION="xsave,xsaveopt,xsave64,xsaveopt64"
# SDE doesn't support AMD extensions
# FIXME: find a way to test these
ENV STDARCH_TEST_SKIP_FEATURE="sse4a,tbm,xop"


@ -12,7 +12,7 @@
# CPUID_VERSION = 1.0
# Input => Output
# EAX ECX => EAX EBX ECX EDX
00000000 ******** => 00000024 68747541 444d4163 69746e65
00000000 ******** => 00000024 756e6547 6c65746e 49656e69
00000001 ******** => 00400f10 00100800 7ffaf3ff bfebfbff
00000002 ******** => 76035a01 00f0b6ff 00000000 00c10000
00000003 ******** => 00000000 00000000 00000000 00000000
@ -49,7 +49,7 @@
00000024 00000000 => 00000001 00070002 00000000 00000000 #AVX10
00000024 00000001 => 00000000 00000000 00000004 00000000
80000000 ******** => 80000008 00000000 00000000 00000000
80000001 ******** => 00000000 00000000 00200961 2c100000
80000001 ******** => 00000000 00000000 00000121 2c100000
80000002 ******** => 00000000 00000000 00000000 00000000
80000003 ******** => 00000000 00000000 00000000 00000000
80000004 ******** => 00000000 00000000 00000000 00000000
@ -59,5 +59,4 @@
80000008 ******** => 00003028 00000200 00000200 00000000
# This file was copied from intel-sde/misc/cpuid/future/cpuid.def, and modified to
# use "AuthenticAMD" as the vendor and the support for `XOP`, `SSE4a`, `TBM`,
# `AVX512_VP2INTERSECT` and the VEX variants of AVX512 was added in the CPUID.
# add support for `AVX512_VP2INTERSECT`
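The changed hex lines are the CPUID vendor string, which leaf 0 returns in EBX, EDX, ECX order. A small decoding sketch (hypothetical helper, not part of the diff):

```rust
fn vendor(ebx: u32, ecx: u32, edx: u32) -> String {
    // The little-endian bytes of EBX, then EDX, then ECX spell the vendor.
    let mut bytes = Vec::with_capacity(12);
    bytes.extend_from_slice(&ebx.to_le_bytes());
    bytes.extend_from_slice(&edx.to_le_bytes());
    bytes.extend_from_slice(&ecx.to_le_bytes());
    String::from_utf8(bytes).unwrap()
}

fn main() {
    // Old leaf-0 line (EAX EBX ECX EDX output order): "AuthenticAMD".
    assert_eq!(vendor(0x68747541, 0x444d4163, 0x69746e65), "AuthenticAMD");
    // New leaf-0 line: "GenuineIntel".
    assert_eq!(vendor(0x756e6547, 0x6c65746e, 0x49656e69), "GenuineIntel");
}
```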


@ -0,0 +1,57 @@
#!/usr/bin/env sh
# Small script to run the intrinsic tests for a target inside its
# respective docker image.
set -ex
if [ $# -lt 1 ]; then
>&2 echo "Usage: $0 <TARGET>"
exit 1
fi
run() {
# Set the linker that is used for the host (e.g. when compiling a build.rs)
# This overrides any configuration in e.g. `.cargo/config.toml`, which will
# probably not work within the docker container.
HOST_LINKER="CARGO_TARGET_$(rustc --print host-tuple | tr '[:lower:]-' '[:upper:]_')_LINKER"
# Prevent `Read-only file system (os error 30)`.
cargo generate-lockfile
echo "Building docker container for TARGET=${1}"
docker build -t stdarch -f "ci/docker/${1}/Dockerfile" ci/
mkdir -p target c_programs rust_programs
echo "Running docker"
# shellcheck disable=SC2016
docker run \
--rm \
--user "$(id -u)":"$(id -g)" \
--env CARGO_HOME=/cargo \
--env CARGO_TARGET_DIR=/checkout/target \
--env TARGET="${1}" \
--env "${HOST_LINKER}"="cc" \
--env STDARCH_DISABLE_ASSERT_INSTR \
--env NOSTD \
--env NORUN \
--env RUSTFLAGS \
--env CARGO_UNSTABLE_BUILD_STD \
--volume "${HOME}/.cargo":/cargo \
--volume "$(rustc --print sysroot)":/rust:ro \
--volume "$(pwd)":/checkout:ro \
--volume "$(pwd)"/target:/checkout/target \
--volume "$(pwd)"/c_programs:/checkout/c_programs \
--volume "$(pwd)"/rust_programs:/checkout/rust_programs \
--init \
--workdir /checkout \
--privileged \
stdarch \
sh -c "HOME=/tmp PATH=\$PATH:/rust/bin exec ci/intrinsic-test.sh ${1}"
}
if [ -z "$1" ]; then
>&2 echo "No target specified!"
exit 1
else
run "${1}"
fi


@ -0,0 +1,123 @@
#!/usr/bin/env sh
set -ex
: "${TARGET?The TARGET environment variable must be set.}"
export RUSTFLAGS="${RUSTFLAGS} -D warnings -Z merge-functions=disabled -Z verify-llvm-ir"
export HOST_RUSTFLAGS="${RUSTFLAGS}"
export PROFILE="${PROFILE:="--profile=release"}"
case ${TARGET} in
# On 32-bit targets, use the static relocation model, which avoids some
# extra instructions when dealing with static data, notably allowing some
# instruction assertion checks to pass below the 20-instruction limit. With
# the default (dynamic) model, too many instructions are generated when we
# assert the instruction for a function, and tests fail.
i686-* | i586-*)
export RUSTFLAGS="${RUSTFLAGS} -C relocation-model=static"
;;
# Some x86_64 targets enable features beyond SSE2 by default, which causes
# some instruction assertion checks to fail.
x86_64-*)
export RUSTFLAGS="${RUSTFLAGS} -C target-feature=-sse3"
;;
# Unoptimized builds use fast-isel, which breaks with MSA.
mips-* | mipsel-*)
export RUSTFLAGS="${RUSTFLAGS} -C llvm-args=-fast-isel=false"
;;
armv7-*eabihf | thumbv7-*eabihf)
export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+neon"
;;
# Some of our test dependencies use the deprecated `gcc` crate, which
# doesn't detect RISC-V compilers automatically, so do it manually here.
riscv*)
export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+zk,+zks,+zbb,+zbc"
;;
esac
echo "RUSTFLAGS=${RUSTFLAGS}"
echo "OBJDUMP=${OBJDUMP}"
echo "PROFILE=${PROFILE}"
INTRINSIC_TEST="--manifest-path=crates/intrinsic-test/Cargo.toml"
# Test targets compiled with extra features.
case ${TARGET} in
# Setup aarch64 & armv7 specific variables, the runner, along with some
# tests to skip
aarch64-unknown-linux-gnu*)
TEST_CPPFLAGS="-fuse-ld=lld -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/9/aarch64-linux-gnu/"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_aarch64.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER}"
;;
aarch64_be-unknown-linux-gnu*)
TEST_CPPFLAGS="-fuse-ld=lld"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_aarch64.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_RUNNER}"
;;
armv7-unknown-linux-gnueabihf*)
TEST_CPPFLAGS="-fuse-ld=lld -I/usr/arm-linux-gnueabihf/include/ -I/usr/arm-linux-gnueabihf/include/c++/9/arm-linux-gnueabihf/"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_arm.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER}"
;;
x86_64-unknown-linux-gnu*)
TEST_CPPFLAGS="-fuse-ld=lld -I/usr/include/x86_64-linux-gnu/"
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER}"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_x86.txt
TEST_SAMPLE_INTRINSICS_PERCENTAGE=5
;;
*)
;;
esac
# Arm specific
case "${TARGET}" in
aarch64-unknown-linux-gnu*|armv7-unknown-linux-gnueabihf*)
CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \
cargo run "${INTRINSIC_TEST}" "${PROFILE}" \
--bin intrinsic-test -- intrinsics_data/arm_intrinsics.json \
--runner "${TEST_RUNNER}" \
--cppcompiler "${TEST_CXX_COMPILER}" \
--skip "${TEST_SKIP_INTRINSICS}" \
--target "${TARGET}"
;;
aarch64_be-unknown-linux-gnu*)
CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \
cargo run "${INTRINSIC_TEST}" "${PROFILE}" \
--bin intrinsic-test -- intrinsics_data/arm_intrinsics.json \
--runner "${TEST_RUNNER}" \
--cppcompiler "${TEST_CXX_COMPILER}" \
--skip "${TEST_SKIP_INTRINSICS}" \
--target "${TARGET}" \
--linker "${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_LINKER}" \
--cxx-toolchain-dir "${AARCH64_BE_TOOLCHAIN}"
;;
x86_64-unknown-linux-gnu*)
# `CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER` is not necessary for `intrinsic-test`
# because the binary needs to run directly on the host.
# Hence the use of `env -u`.
env -u CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER \
CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" \
RUST_LOG=warn RUST_BACKTRACE=1 \
cargo run "${INTRINSIC_TEST}" "${PROFILE}" \
--bin intrinsic-test -- intrinsics_data/x86-intel.xml \
--runner "${TEST_RUNNER}" \
--skip "${TEST_SKIP_INTRINSICS}" \
--cppcompiler "${TEST_CXX_COMPILER}" \
--target "${TARGET}" \
--sample-percentage "${TEST_SAMPLE_INTRINSICS_PERCENTAGE}"
;;
*)
;;
esac


@ -79,7 +79,6 @@ cargo_test() {
CORE_ARCH="--manifest-path=crates/core_arch/Cargo.toml"
STDARCH_EXAMPLES="--manifest-path=examples/Cargo.toml"
INTRINSIC_TEST="--manifest-path=crates/intrinsic-test/Cargo.toml"
cargo_test "${CORE_ARCH} ${PROFILE}"
@ -130,61 +129,11 @@ case ${TARGET} in
export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+altivec"
cargo_test "${PROFILE}"
;;
# Setup aarch64 & armv7 specific variables, the runner, along with some
# tests to skip
aarch64-unknown-linux-gnu*)
TEST_CPPFLAGS="-fuse-ld=lld -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/9/aarch64-linux-gnu/"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_aarch64.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER}"
;;
aarch64_be-unknown-linux-gnu*)
TEST_CPPFLAGS="-fuse-ld=lld"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_aarch64.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_RUNNER}"
;;
armv7-unknown-linux-gnueabihf*)
TEST_CPPFLAGS="-fuse-ld=lld -I/usr/arm-linux-gnueabihf/include/ -I/usr/arm-linux-gnueabihf/include/c++/9/arm-linux-gnueabihf/"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_arm.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER}"
;;
*)
;;
esac
# Arm specific
case "${TARGET}" in
aarch64-unknown-linux-gnu*|armv7-unknown-linux-gnueabihf*)
CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \
cargo run "${INTRINSIC_TEST}" "${PROFILE}" \
--bin intrinsic-test -- intrinsics_data/arm_intrinsics.json \
--runner "${TEST_RUNNER}" \
--cppcompiler "${TEST_CXX_COMPILER}" \
--skip "${TEST_SKIP_INTRINSICS}" \
--target "${TARGET}"
;;
aarch64_be-unknown-linux-gnu*)
CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \
cargo run "${INTRINSIC_TEST}" "${PROFILE}" \
--bin intrinsic-test -- intrinsics_data/arm_intrinsics.json \
--runner "${TEST_RUNNER}" \
--cppcompiler "${TEST_CXX_COMPILER}" \
--skip "${TEST_SKIP_INTRINSICS}" \
--target "${TARGET}" \
--linker "${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_LINKER}" \
--cxx-toolchain-dir "${AARCH64_BE_TOOLCHAIN}"
;;
*)
;;
esac
if [ "$NORUN" != "1" ] && [ "$NOSTD" != 1 ]; then
# Test examples
(


@ -44,9 +44,14 @@ use crate::arch::asm;
#[inline]
#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
pub fn pause() {
// Use `.option` directives to expose this HINT instruction
// (no-op if not supported by the hardware) without `#[target_feature]`.
unsafe {
asm!(
".insn i 0x0F, 0, x0, x0, 0x010",
".option push",
".option arch, +zihintpause",
"pause",
".option pop",
options(nomem, nostack, preserves_flags)
);
}


@ -60,26 +60,6 @@ struct PackedTuple<T, U> {
#[allow(improper_ctypes)]
#[rustfmt::skip]
unsafe extern "unadjusted" {
#[link_name = "llvm.smax.v16i8"] fn vmxb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char;
#[link_name = "llvm.smax.v8i16"] fn vmxh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short;
#[link_name = "llvm.smax.v4i32"] fn vmxf(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
#[link_name = "llvm.smax.v2i64"] fn vmxg(a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_long_long;
#[link_name = "llvm.umax.v16i8"] fn vmxlb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char;
#[link_name = "llvm.umax.v8i16"] fn vmxlh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short;
#[link_name = "llvm.umax.v4i32"] fn vmxlf(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int;
#[link_name = "llvm.umax.v2i64"] fn vmxlg(a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long;
#[link_name = "llvm.smin.v16i8"] fn vmnb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char;
#[link_name = "llvm.smin.v8i16"] fn vmnh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short;
#[link_name = "llvm.smin.v4i32"] fn vmnf(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
#[link_name = "llvm.smin.v2i64"] fn vmng(a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_long_long;
#[link_name = "llvm.umin.v16i8"] fn vmnlb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char;
#[link_name = "llvm.umin.v8i16"] fn vmnlh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short;
#[link_name = "llvm.umin.v4i32"] fn vmnlf(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int;
#[link_name = "llvm.umin.v2i64"] fn vmnlg(a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long;
#[link_name = "llvm.nearbyint.v4f32"] fn nearbyint_v4f32(a: vector_float) -> vector_float;
#[link_name = "llvm.nearbyint.v2f64"] fn nearbyint_v2f64(a: vector_double) -> vector_double;
@ -683,17 +663,40 @@ mod sealed {
unsafe fn vec_max(self, b: Other) -> Self::Result;
}
test_impl! { vec_vmxsb (a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [vmxb, vmxb] }
test_impl! { vec_vmxsh (a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [vmxh, vmxh] }
test_impl! { vec_vmxsf (a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [vmxf, vmxf] }
test_impl! { vec_vmxsg (a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_long_long [vmxg, vmxg] }
test_impl! { vec_vmxslb (a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [vmxlb, vmxlb] }
test_impl! { vec_vmxslh (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [vmxlh, vmxlh] }
test_impl! { vec_vmxslf (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [vmxlf, vmxlf] }
test_impl! { vec_vmxslg (a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long [vmxlg, vmxlg] }
impl_vec_trait! { [VectorMax vec_max] ~(vmxlb, vmxb, vmxlh, vmxh, vmxlf, vmxf, vmxlg, vmxg) }
macro_rules! impl_max {
    ($name:ident, $a:ty, $instr:ident) => {
        #[inline]
        #[target_feature(enable = "vector")]
        #[cfg_attr(test, assert_instr($instr))]
        pub unsafe fn $name(a: $a, b: $a) -> $a {
            simd_select(simd_ge::<_, $a>(a, b), a, b)
        }

        #[unstable(feature = "stdarch_s390x", issue = "135681")]
        impl VectorMax<Self> for $a {
            type Result = Self;

            #[inline]
            #[target_feature(enable = "vector")]
            unsafe fn vec_max(self, other: Self) -> Self {
                $name(self, other)
            }
        }
    };
}
mod impl_max {
    use super::*;

    impl_max!(vec_vmxsc, vector_signed_char, vmxb);
    impl_max!(vec_vmxslc, vector_unsigned_char, vmxlb);
    impl_max!(vec_vmxsh, vector_signed_short, vmxh);
    impl_max!(vec_vmxslh, vector_unsigned_short, vmxlh);
    impl_max!(vec_vmxsf, vector_signed_int, vmxf);
    impl_max!(vec_vmxslf, vector_unsigned_int, vmxlf);
    impl_max!(vec_vmxsg, vector_signed_long_long, vmxg);
    impl_max!(vec_vmxslg, vector_unsigned_long_long, vmxlg);
}
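The macro swaps per-width LLVM intrinsics for a generic compare-and-select. The same idea, expressed with portable SIMD purely for illustration (a sketch assuming nightly `portable_simd`, not s390x code):

```rust
#![feature(portable_simd)]
use std::simd::cmp::SimdOrd;
use std::simd::i32x4;

fn main() {
    // Lanewise max via compare-and-select, as `impl_max!` does with
    // `simd_ge` + `simd_select`.
    let a = i32x4::from_array([1, -5, 3, 7]);
    let b = i32x4::from_array([2, -6, 3, 0]);
    assert_eq!(a.simd_max(b).to_array(), [2, -5, 3, 7]);
}
```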
test_impl! { vec_vfmaxsb (a: vector_float, b: vector_float) -> vector_float [simd_fmax, "vector-enhancements-1" vfmaxsb ] }
test_impl! { vec_vfmaxdb (a: vector_double, b: vector_double) -> vector_double [simd_fmax, "vector-enhancements-1" vfmaxdb] }
@ -707,17 +710,40 @@ mod sealed {
unsafe fn vec_min(self, b: Other) -> Self::Result;
}
test_impl! { vec_vmnsb (a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [vmnb, vmnb] }
test_impl! { vec_vmnsh (a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [vmnh, vmnh] }
test_impl! { vec_vmnsf (a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [vmnf, vmnf] }
test_impl! { vec_vmnsg (a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_long_long [vmng, vmng] }
test_impl! { vec_vmnslb (a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [vmnlb, vmnlb] }
test_impl! { vec_vmnslh (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [vmnlh, vmnlh] }
test_impl! { vec_vmnslf (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [vmnlf, vmnlf] }
test_impl! { vec_vmnslg (a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long [vmnlg, vmnlg] }
impl_vec_trait! { [VectorMin vec_min] ~(vmxlb, vmxb, vmxlh, vmxh, vmxlf, vmxf, vmxlg, vmxg) }
macro_rules! impl_min {
    ($name:ident, $a:ty, $instr:ident) => {
        #[inline]
        #[target_feature(enable = "vector")]
        #[cfg_attr(test, assert_instr($instr))]
        pub unsafe fn $name(a: $a, b: $a) -> $a {
            simd_select(simd_le::<_, $a>(a, b), a, b)
        }

        #[unstable(feature = "stdarch_s390x", issue = "135681")]
        impl VectorMin<Self> for $a {
            type Result = Self;

            #[inline]
            #[target_feature(enable = "vector")]
            unsafe fn vec_min(self, other: Self) -> Self {
                $name(self, other)
            }
        }
    };
}
mod impl_min {
    use super::*;

    impl_min!(vec_vmnsc, vector_signed_char, vmnb);
    impl_min!(vec_vmnslc, vector_unsigned_char, vmnlb);
    impl_min!(vec_vmnsh, vector_signed_short, vmnh);
    impl_min!(vec_vmnslh, vector_unsigned_short, vmnlh);
    impl_min!(vec_vmnsf, vector_signed_int, vmnf);
    impl_min!(vec_vmnslf, vector_unsigned_int, vmnlf);
    impl_min!(vec_vmnsg, vector_signed_long_long, vmng);
    impl_min!(vec_vmnslg, vector_unsigned_long_long, vmnlg);
}
test_impl! { vec_vfminsb (a: vector_float, b: vector_float) -> vector_float [simd_fmin, "vector-enhancements-1" vfminsb] }
test_impl! { vec_vfmindb (a: vector_double, b: vector_double) -> vector_double [simd_fmin, "vector-enhancements-1" vfmindb] }
@ -2368,17 +2394,13 @@ mod sealed {
unsafe fn vec_packsu(self, b: Other) -> Self::Result;
}
unsafe fn simd_smax<T: Copy>(a: T, b: T) -> T {
simd_select::<T, T>(simd_gt::<T, T>(a, b), a, b)
}
#[inline]
#[target_feature(enable = "vector")]
#[cfg_attr(test, assert_instr(vpklsh))]
unsafe fn vpacksuh(a: vector_signed_short, b: vector_signed_short) -> vector_unsigned_char {
vpklsh(
simd_smax(a, vector_signed_short([0; 8])),
simd_smax(b, vector_signed_short([0; 8])),
vec_max(a, vector_signed_short([0; 8])),
vec_max(b, vector_signed_short([0; 8])),
)
}
#[inline]
@ -2386,8 +2408,8 @@ mod sealed {
#[cfg_attr(test, assert_instr(vpklsf))]
unsafe fn vpacksuf(a: vector_signed_int, b: vector_signed_int) -> vector_unsigned_short {
vpklsf(
simd_smax(a, vector_signed_int([0; 4])),
simd_smax(b, vector_signed_int([0; 4])),
vec_max(a, vector_signed_int([0; 4])),
vec_max(b, vector_signed_int([0; 4])),
)
}
#[inline]
@ -2398,8 +2420,8 @@ mod sealed {
b: vector_signed_long_long,
) -> vector_unsigned_int {
vpklsg(
simd_smax(a, vector_signed_long_long([0; 2])),
simd_smax(b, vector_signed_long_long([0; 2])),
vec_max(a, vector_signed_long_long([0; 2])),
vec_max(b, vector_signed_long_long([0; 2])),
)
}
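Element-wise, the saturating pack clamps negatives to zero (the `vec_max` call) and then saturates at the narrower width. A scalar model of one `vpacksuh` element (a sketch, not the SIMD code path):

```rust
fn packsu_element(x: i16) -> u8 {
    let clamped = x.max(0) as u16; // vec_max(a, 0)
    clamped.min(u8::MAX as u16) as u8 // vpklsh's unsigned saturation
}

fn main() {
    assert_eq!(packsu_element(-7), 0);
    assert_eq!(packsu_element(300), 255);
    assert_eq!(packsu_element(42), 42);
}
```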


@ -5,8 +5,6 @@ use stdarch_test::assert_instr;
unsafe extern "unadjusted" {
#[link_name = "llvm.x86.addcarry.32"]
fn llvm_addcarry_u32(a: u8, b: u32, c: u32) -> (u8, u32);
#[link_name = "llvm.x86.addcarryx.u32"]
fn llvm_addcarryx_u32(a: u8, b: u32, c: u32, d: *mut u32) -> u8;
#[link_name = "llvm.x86.subborrow.32"]
fn llvm_subborrow_u32(a: u8, b: u32, c: u32) -> (u8, u32);
}
@ -19,8 +17,8 @@ unsafe extern "unadjusted" {
#[inline]
#[cfg_attr(test, assert_instr(adc))]
#[stable(feature = "simd_x86_adx", since = "1.33.0")]
pub unsafe fn _addcarry_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
let (a, b) = llvm_addcarry_u32(c_in, a, b);
pub fn _addcarry_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
let (a, b) = unsafe { llvm_addcarry_u32(c_in, a, b) };
*out = b;
a
}
@ -34,8 +32,8 @@ pub unsafe fn _addcarry_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
#[target_feature(enable = "adx")]
#[cfg_attr(test, assert_instr(adc))]
#[stable(feature = "simd_x86_adx", since = "1.33.0")]
pub unsafe fn _addcarryx_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
llvm_addcarryx_u32(c_in, a, b, out as *mut _)
pub fn _addcarryx_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
_addcarry_u32(c_in, a, b, out)
}
/// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in`
@ -46,8 +44,8 @@ pub unsafe fn _addcarryx_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
#[inline]
#[cfg_attr(test, assert_instr(sbb))]
#[stable(feature = "simd_x86_adx", since = "1.33.0")]
pub unsafe fn _subborrow_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
let (a, b) = llvm_subborrow_u32(c_in, a, b);
pub fn _subborrow_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
let (a, b) = unsafe { llvm_subborrow_u32(c_in, a, b) };
*out = b;
a
}
@ -60,38 +58,36 @@ mod tests {
#[test]
fn test_addcarry_u32() {
unsafe {
let a = u32::MAX;
let mut out = 0;
let a = u32::MAX;
let mut out = 0;
let r = _addcarry_u32(0, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarry_u32(0, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarry_u32(0, a, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, a);
let r = _addcarry_u32(0, a, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, a);
let r = _addcarry_u32(1, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 1);
let r = _addcarry_u32(1, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 1);
let r = _addcarry_u32(1, a, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarry_u32(1, a, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarry_u32(0, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 7);
let r = _addcarry_u32(0, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 7);
let r = _addcarry_u32(1, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 8);
}
let r = _addcarry_u32(1, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 8);
}
#[simd_test(enable = "adx")]
unsafe fn test_addcarryx_u32() {
fn test_addcarryx_u32() {
let a = u32::MAX;
let mut out = 0;
@ -121,44 +117,39 @@ mod tests {
}
#[simd_test(enable = "adx")]
unsafe fn test_addcarryx_u32_2() {
unsafe fn add_1_2_3() -> u32 {
let mut out = 0;
_addcarryx_u32(1, 2, 3, &mut out);
out
}
assert_eq!(6, add_1_2_3());
fn test_addcarryx_u32_2() {
let mut out = 0;
_addcarryx_u32(1, 2, 3, &mut out);
assert_eq!(6, out);
}
#[test]
fn test_subborrow_u32() {
unsafe {
let a = u32::MAX;
let mut out = 0;
let a = u32::MAX;
let mut out = 0;
let r = _subborrow_u32(0, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u32(0, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u32(0, 0, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 0);
let r = _subborrow_u32(0, 0, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 0);
let r = _subborrow_u32(1, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a - 1);
let r = _subborrow_u32(1, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a - 1);
let r = _subborrow_u32(1, 0, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u32(1, 0, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u32(0, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 4);
let r = _subborrow_u32(0, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 4);
let r = _subborrow_u32(1, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 3);
}
let r = _subborrow_u32(1, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 3);
}
}
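With the carry intrinsics now safe functions, chaining limbs looks like this (a usage sketch, not code from this diff; on older toolchains the calls still need `unsafe`):

```rust
#[cfg(target_arch = "x86_64")]
fn add_u64_via_u32(a: u64, b: u64) -> (u64, bool) {
    use std::arch::x86_64::_addcarry_u32;
    let (mut lo, mut hi) = (0u32, 0u32);
    // Feed the carry out of the low limb into the high limb.
    let c = _addcarry_u32(0, a as u32, b as u32, &mut lo);
    let c = _addcarry_u32(c, (a >> 32) as u32, (b >> 32) as u32, &mut hi);
    (u64::from(hi) << 32 | u64::from(lo), c != 0)
}
```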


@ -587,7 +587,11 @@ pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
unsafe { vhaddpd(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
simd_add(even, odd)
}
}
/// Horizontal addition of adjacent pairs in the two packed vectors
@ -602,7 +606,11 @@ pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
unsafe { vhaddps(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
simd_add(even, odd)
}
}
/// Horizontal subtraction of adjacent pairs in the two packed vectors
@ -616,7 +624,11 @@ pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
unsafe { vhsubpd(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
simd_sub(even, odd)
}
}
/// Horizontal subtraction of adjacent pairs in the two packed vectors
@ -631,7 +643,11 @@ pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
unsafe { vhsubps(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
simd_sub(even, odd)
}
}
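The shuffle-based lowering keeps the documented lane order for horizontal add/sub. A hedged sanity check (assumes AVX is available):

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn hadd_pd_demo() {
    use std::arch::x86_64::*;
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
    // Even/odd shuffle + add yields [a0+a1, b0+b1, a2+a3, b2+b3].
    let r = _mm256_hadd_pd(a, b);
    let mut out = [0.0f64; 4];
    _mm256_storeu_pd(out.as_mut_ptr(), r);
    assert_eq!(out, [3.0, 30.0, 7.0, 70.0]);
}
```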
/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
@ -1218,7 +1234,10 @@ pub fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
static_assert_uimm_bits!(IMM8, 8);
unsafe { vperm2f128ps256(a, b, IMM8 as i8) }
_mm256_castsi256_ps(_mm256_permute2f128_si256::<IMM8>(
_mm256_castps_si256(a),
_mm256_castps_si256(b),
))
}
/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
@ -1232,7 +1251,10 @@ pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
static_assert_uimm_bits!(IMM8, 8);
unsafe { vperm2f128pd256(a, b, IMM8 as i8) }
_mm256_castsi256_pd(_mm256_permute2f128_si256::<IMM8>(
_mm256_castpd_si256(a),
_mm256_castpd_si256(b),
))
}
/// Shuffles 128-bits (composed of integer data) selected by `imm8`
@ -1246,7 +1268,35 @@ pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
static_assert_uimm_bits!(IMM8, 8);
unsafe { transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) }
const fn idx(imm8: i32, pos: u32) -> u32 {
let part = if pos < 2 {
imm8 & 0xf
} else {
(imm8 & 0xf0) >> 4
};
2 * (part as u32 & 0b11) + (pos & 1)
}
const fn idx0(imm8: i32, pos: u32) -> u32 {
let part = if pos < 2 {
imm8 & 0xf
} else {
(imm8 & 0xf0) >> 4
};
if part & 0b1000 != 0 { 4 } else { pos }
}
unsafe {
let r = simd_shuffle!(
a.as_i64x4(),
b.as_i64x4(),
[idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)]
);
let r: i64x4 = simd_shuffle!(
r,
i64x4::ZERO,
[idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)]
);
r.as_m256i()
}
}
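The `idx`/`idx0` helpers model VPERM2F128's immediate: each nibble selects a 128-bit half, and bit 3 of a nibble zeroes that half instead. For example (assumes AVX):

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn permute2f128_demo() {
    use std::arch::x86_64::*;
    let a = _mm256_setr_epi64x(0, 1, 2, 3);
    let b = _mm256_setr_epi64x(4, 5, 6, 7);
    // IMM8 = 0x21: low nibble 1 picks the high half of `a`; high nibble 2
    // picks the low half of `b`.
    let r = _mm256_permute2f128_si256::<0x21>(a, b);
    let mut out = [0i64; 4];
    _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
    assert_eq!(out, [2, 3, 4, 5]);
}
```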
/// Broadcasts a single-precision (32-bit) floating-point element from memory
@ -1783,6 +1833,7 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vmovntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("vmovntdq", ",{a}"),
p = in(reg) mem_addr,
@ -1811,6 +1862,7 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("vmovntpd", ",{a}"),
p = in(reg) mem_addr,
@ -1840,6 +1892,7 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("vmovntps", ",{a}"),
p = in(reg) mem_addr,
@ -1933,7 +1986,10 @@ pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
unsafe { ptestz256(a.as_i64x4(), b.as_i64x4()) }
unsafe {
let r = simd_and(a.as_i64x4(), b.as_i64x4());
(0i64 == simd_reduce_or(r)) as i32
}
}
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
@ -1947,7 +2003,10 @@ pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
unsafe { ptestc256(a.as_i64x4(), b.as_i64x4()) }
unsafe {
let r = simd_and(simd_xor(a.as_i64x4(), i64x4::splat(!0)), b.as_i64x4());
(0i64 == simd_reduce_or(r)) as i32
}
}
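These open-coded reductions mirror VPTEST's flag semantics: `testz` reports whether `a & b` is all zeros, `testc` whether `!a & b` is. A quick check of the ZF case (a sketch; assumes AVX):

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn testz_demo() {
    use std::arch::x86_64::*;
    let a = _mm256_setr_epi64x(1, 2, 4, 8);
    let b = _mm256_setr_epi64x(2, 1, 8, 4); // bit-disjoint from `a`
    // 1 iff (a & b) is all zeros, exactly what the simd_reduce_or computes.
    assert_eq!(_mm256_testz_si256(a, b), 1);
    assert_eq!(_mm256_testz_si256(a, a), 0);
}
```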
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
@ -2031,7 +2090,10 @@ pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
unsafe { vtestzpd(a, b) }
unsafe {
let r: i64x2 = simd_lt(transmute(_mm_and_pd(a, b)), i64x2::ZERO);
(0i64 == simd_reduce_or(r)) as i32
}
}
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
@ -2048,7 +2110,10 @@ pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
unsafe { vtestcpd(a, b) }
unsafe {
let r: i64x2 = simd_lt(transmute(_mm_andnot_pd(a, b)), i64x2::ZERO);
(0i64 == simd_reduce_or(r)) as i32
}
}
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
@ -2135,7 +2200,10 @@ pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
unsafe { vtestzps(a, b) }
unsafe {
let r: i32x4 = simd_lt(transmute(_mm_and_ps(a, b)), i32x4::ZERO);
(0i32 == simd_reduce_or(r)) as i32
}
}
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
@ -2152,7 +2220,10 @@ pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
unsafe { vtestcps(a, b) }
unsafe {
let r: i32x4 = simd_lt(transmute(_mm_andnot_ps(a, b)), i32x4::ZERO);
(0i32 == simd_reduce_or(r)) as i32
}
}
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
@ -3044,14 +3115,6 @@ unsafe extern "C" {
fn roundps256(a: __m256, b: i32) -> __m256;
#[link_name = "llvm.x86.avx.dp.ps.256"]
fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
#[link_name = "llvm.x86.avx.hadd.pd.256"]
fn vhaddpd(a: __m256d, b: __m256d) -> __m256d;
#[link_name = "llvm.x86.avx.hadd.ps.256"]
fn vhaddps(a: __m256, b: __m256) -> __m256;
#[link_name = "llvm.x86.avx.hsub.pd.256"]
fn vhsubpd(a: __m256d, b: __m256d) -> __m256d;
#[link_name = "llvm.x86.avx.hsub.ps.256"]
fn vhsubps(a: __m256, b: __m256) -> __m256;
#[link_name = "llvm.x86.sse2.cmp.pd"]
fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
#[link_name = "llvm.x86.avx.cmp.pd.256"]
@ -3084,12 +3147,6 @@ unsafe extern "C" {
fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
#[link_name = "llvm.x86.avx.vpermilvar.pd"]
fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
#[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256;
#[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d;
#[link_name = "llvm.x86.avx.vperm2f128.si.256"]
fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
#[link_name = "llvm.x86.avx.maskload.pd.256"]
fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
#[link_name = "llvm.x86.avx.maskstore.pd.256"]
@ -3112,10 +3169,6 @@ unsafe extern "C" {
fn vrcpps(a: __m256) -> __m256;
#[link_name = "llvm.x86.avx.rsqrt.ps.256"]
fn vrsqrtps(a: __m256) -> __m256;
#[link_name = "llvm.x86.avx.ptestz.256"]
fn ptestz256(a: i64x4, b: i64x4) -> i32;
#[link_name = "llvm.x86.avx.ptestc.256"]
fn ptestc256(a: i64x4, b: i64x4) -> i32;
#[link_name = "llvm.x86.avx.ptestnzc.256"]
fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
#[link_name = "llvm.x86.avx.vtestz.pd.256"]
@ -3124,10 +3177,6 @@ unsafe extern "C" {
fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
#[link_name = "llvm.x86.avx.vtestz.pd"]
fn vtestzpd(a: __m128d, b: __m128d) -> i32;
#[link_name = "llvm.x86.avx.vtestc.pd"]
fn vtestcpd(a: __m128d, b: __m128d) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.pd"]
fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
#[link_name = "llvm.x86.avx.vtestz.ps.256"]
@ -3136,10 +3185,6 @@ unsafe extern "C" {
fn vtestcps256(a: __m256, b: __m256) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
fn vtestnzcps256(a: __m256, b: __m256) -> i32;
#[link_name = "llvm.x86.avx.vtestz.ps"]
fn vtestzps(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.avx.vtestc.ps"]
fn vtestcps(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.ps"]
fn vtestnzcps(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.avx.min.ps.256"]
@ -4249,6 +4294,7 @@ mod tests {
let a = _mm256_setr_epi64x(1, 2, 3, 4);
let mut r = _mm256_undefined_si256();
_mm256_stream_si256(ptr::addr_of_mut!(r), a);
_mm_sfence();
assert_eq_m256i(r, a);
}
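The `_mm_sfence()` calls added to these tests are not incidental: streaming stores are weakly ordered and may bypass the cache, so a fence is needed before the stored data is read back through ordinary loads. A minimal sketch of the pattern (`stream_then_read` is a hypothetical helper; assumes an x86_64 target with AVX available at runtime):

use std::arch::x86_64::{__m256i, _mm256_stream_si256, _mm_sfence};

unsafe fn stream_then_read(dst: *mut __m256i, v: __m256i) -> __m256i {
    _mm256_stream_si256(dst, v); // weakly ordered nontemporal store
    _mm_sfence(); // order the store before the plain load below
    *dst
}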
@ -4263,6 +4309,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 4] };
_mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
_mm_sfence();
for i in 0..4 {
assert_eq!(mem.data[i], get_m256d(a, i));
}
@ -4279,6 +4326,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 8] };
_mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
_mm_sfence();
for i in 0..8 {
assert_eq!(mem.data[i], get_m256(a, i));
}

View file

@ -891,7 +891,21 @@ pub fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
#[cfg_attr(test, assert_instr(vphaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) }
let a = a.as_i16x16();
let b = b.as_i16x16();
unsafe {
let even: i16x16 = simd_shuffle!(
a,
b,
[0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
);
let odd: i16x16 = simd_shuffle!(
a,
b,
[1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
);
simd_add(even, odd).as_m256i()
}
}
/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
@ -902,7 +916,13 @@ pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vphaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(phaddd(a.as_i32x8(), b.as_i32x8())) }
let a = a.as_i32x8();
let b = b.as_i32x8();
unsafe {
let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
simd_add(even, odd).as_m256i()
}
}
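The shuffle indices are less opaque than they look: one gather collects the even-position inputs, the other the odd-position inputs, and adding the two gathers is the horizontal add. A scalar model of one 128-bit lane of `vphaddd` (hypothetical helper):

fn hadd_epi32_lane(a: [i32; 4], b: [i32; 4]) -> [i32; 4] {
    let even = [a[0], a[2], b[0], b[2]];
    let odd = [a[1], a[3], b[1], b[3]];
    let mut r = [0i32; 4];
    for i in 0..4 {
        r[i] = even[i].wrapping_add(odd[i]);
    }
    r
}

fn main() {
    assert_eq!(hadd_epi32_lane([1, 2, 3, 4], [10, 20, 30, 40]), [3, 7, 30, 70]);
}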
/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
@ -925,7 +945,21 @@ pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vphsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(phsubw(a.as_i16x16(), b.as_i16x16())) }
let a = a.as_i16x16();
let b = b.as_i16x16();
unsafe {
let even: i16x16 = simd_shuffle!(
a,
b,
[0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
);
let odd: i16x16 = simd_shuffle!(
a,
b,
[1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
);
simd_sub(even, odd).as_m256i()
}
}
/// Horizontally subtracts adjacent pairs of 32-bit integers in `a` and `b`.
@ -936,7 +970,13 @@ pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vphsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(phsubd(a.as_i32x8(), b.as_i32x8())) }
let a = a.as_i32x8();
let b = b.as_i32x8();
unsafe {
let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
simd_sub(even, odd).as_m256i()
}
}
/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`
@ -1714,7 +1754,12 @@ pub fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m25
#[cfg_attr(test, assert_instr(vpmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
unsafe {
let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
simd_add(even, odd).as_m256i()
}
}
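This expresses `pmaddwd` as a widening multiply followed by the same even/odd pairing used for the horizontal adds: each 32-bit output is `a[2i]*b[2i] + a[2i+1]*b[2i+1]`. One output lane, on scalars (hypothetical helper):

fn madd_pair(a0: i16, a1: i16, b0: i16, b1: i16) -> i32 {
    (a0 as i32 * b0 as i32).wrapping_add(a1 as i32 * b1 as i32)
}

fn main() {
    assert_eq!(madd_pair(2, 3, 10, 100), 320); // 2*10 + 3*100
}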
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
@ -2285,7 +2330,7 @@ pub fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
static_assert_uimm_bits!(IMM8, 8);
unsafe { transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) }
_mm256_permute2f128_si256::<IMM8>(a, b)
}
/// Shuffles 64-bit floating-point elements in `a` across lanes using the
@ -3594,20 +3639,10 @@ pub fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
#[allow(improper_ctypes)]
unsafe extern "C" {
#[link_name = "llvm.x86.avx2.phadd.w"]
fn phaddw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.phadd.d"]
fn phaddd(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx2.phadd.sw"]
fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.phsub.w"]
fn phsubw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.phsub.d"]
fn phsubd(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx2.phsub.sw"]
fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.pmadd.wd"]
fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
#[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
#[link_name = "llvm.x86.avx2.maskload.d"]
@ -3688,8 +3723,6 @@ unsafe extern "C" {
fn permd(a: u32x8, b: u32x8) -> u32x8;
#[link_name = "llvm.x86.avx2.permps"]
fn permps(a: __m256, b: i32x8) -> __m256;
#[link_name = "llvm.x86.avx2.vperm2i128"]
fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
#[link_name = "llvm.x86.avx2.gather.d.d"]
fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
#[link_name = "llvm.x86.avx2.gather.d.d.256"]

View file

@ -5835,7 +5835,20 @@ pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) }
unsafe {
let r: i32x32 = simd_mul(simd_cast(a.as_i16x32()), simd_cast(b.as_i16x32()));
let even: i32x16 = simd_shuffle!(
r,
r,
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
);
let odd: i32x16 = simd_shuffle!(
r,
r,
[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]
);
simd_add(even, odd).as_m512i()
}
}
/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -10427,7 +10440,7 @@ pub fn _kortestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftli_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
a << COUNT
a.unbounded_shl(COUNT)
}
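`unbounded_shl`/`unbounded_shr` match the kshift hardware semantics: a count at or beyond the bit width produces zero, whereas plain `<<`/`>>` with such a count panics in debug builds and masks the count in release builds. On a scalar:

fn main() {
    let a: u32 = 0b1011;
    assert_eq!(a.unbounded_shl(3), 0b1011_000);
    assert_eq!(a.unbounded_shl(32), 0); // `a << 32` would panic in a debug build
    assert_eq!(a.unbounded_shr(35), 0);
}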
/// Shift the bits of 64-bit mask a left by count while shifting in zeros, and store the least significant 64 bits of the result in k.
@ -10438,7 +10451,7 @@ pub fn _kshiftli_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftli_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
a << COUNT
a.unbounded_shl(COUNT)
}
/// Shift the bits of 32-bit mask a right by count while shifting in zeros, and store the least significant 32 bits of the result in k.
@ -10449,7 +10462,7 @@ pub fn _kshiftli_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftri_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
a >> COUNT
a.unbounded_shr(COUNT)
}
/// Shift the bits of 64-bit mask a right by count while shifting in zeros, and store the least significant 64 bits of the result in k.
@ -10460,7 +10473,7 @@ pub fn _kshiftri_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftri_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
a >> COUNT
a.unbounded_shr(COUNT)
}
/// Compute the bitwise AND of 32-bit masks a and b, and if the result is all zeros, store 1 in dst,
@ -11617,8 +11630,6 @@ unsafe extern "C" {
#[link_name = "llvm.x86.avx512.pmul.hr.sw.512"]
fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32;
#[link_name = "llvm.x86.avx512.pmaddw.d.512"]
fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16;
#[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
fn vpmaddubsw(a: i8x64, b: i8x64) -> i16x32;
@ -20325,6 +20336,18 @@ mod tests {
let r = _kshiftli_mask32::<3>(a);
let e: __mmask32 = 0b0100101101001011_0100101101001000;
assert_eq!(r, e);
let r = _kshiftli_mask32::<31>(a);
let e: __mmask32 = 0b1000000000000000_0000000000000000;
assert_eq!(r, e);
let r = _kshiftli_mask32::<32>(a);
let e: __mmask32 = 0b0000000000000000_0000000000000000;
assert_eq!(r, e);
let r = _kshiftli_mask32::<33>(a);
let e: __mmask32 = 0b0000000000000000_0000000000000000;
assert_eq!(r, e);
}
#[simd_test(enable = "avx512bw")]
@ -20333,21 +20356,61 @@ mod tests {
let r = _kshiftli_mask64::<3>(a);
let e: __mmask64 = 0b0110100101101001011_0100101101001000;
assert_eq!(r, e);
let r = _kshiftli_mask64::<63>(a);
let e: __mmask64 = 0b1000000000000000_0000000000000000_0000000000000000_0000000000000000;
assert_eq!(r, e);
let r = _kshiftli_mask64::<64>(a);
let e: __mmask64 = 0b0000000000000000_0000000000000000_0000000000000000_0000000000000000;
assert_eq!(r, e);
let r = _kshiftli_mask64::<65>(a);
let e: __mmask64 = 0b0000000000000000_0000000000000000_0000000000000000_0000000000000000;
assert_eq!(r, e);
}
#[simd_test(enable = "avx512bw")]
unsafe fn test_kshiftri_mask32() {
let a: __mmask32 = 0b0110100101101001_0110100101101001;
let a: __mmask32 = 0b1010100101101001_0110100101101001;
let r = _kshiftri_mask32::<3>(a);
let e: __mmask32 = 0b0000110100101101_0010110100101101;
let e: __mmask32 = 0b0001010100101101_0010110100101101;
assert_eq!(r, e);
let r = _kshiftri_mask32::<31>(a);
let e: __mmask32 = 0b0000000000000000_0000000000000001;
assert_eq!(r, e);
let r = _kshiftri_mask32::<32>(a);
let e: __mmask32 = 0b0000000000000000_0000000000000000;
assert_eq!(r, e);
let r = _kshiftri_mask32::<33>(a);
let e: __mmask32 = 0b0000000000000000_0000000000000000;
assert_eq!(r, e);
}
#[simd_test(enable = "avx512bw")]
unsafe fn test_kshiftri_mask64() {
let a: __mmask64 = 0b0110100101101001011_0100101101001000;
let a: __mmask64 = 0b1010100101101001011_0100101101001000;
let r = _kshiftri_mask64::<3>(a);
let e: __mmask64 = 0b0110100101101001_0110100101101001;
let e: __mmask64 = 0b1010100101101001_0110100101101001;
assert_eq!(r, e);
let r = _kshiftri_mask64::<34>(a);
let e: __mmask64 = 0b0000000000000000_0000000000000000_0000000000000000_0000000000000001;
assert_eq!(r, e);
let r = _kshiftri_mask64::<35>(a);
let e: __mmask64 = 0b0000000000000000_0000000000000000_0000000000000000_0000000000000000;
assert_eq!(r, e);
let r = _kshiftri_mask64::<64>(a);
let e: __mmask64 = 0b0000000000000000_0000000000000000_0000000000000000_0000000000000000;
assert_eq!(r, e);
let r = _kshiftri_mask64::<65>(a);
let e: __mmask64 = 0b0000000000000000_0000000000000000_0000000000000000_0000000000000000;
assert_eq!(r, e);
}

View file

@ -4602,7 +4602,7 @@ pub fn _kortestz_mask8_u8(a: __mmask8, b: __mmask8) -> u8 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftli_mask8<const COUNT: u32>(a: __mmask8) -> __mmask8 {
a << COUNT
a.unbounded_shl(COUNT)
}
/// Shift 8-bit mask a right by count bits while shifting in zeros, and store the result in dst.
@ -4613,7 +4613,7 @@ pub fn _kshiftli_mask8<const COUNT: u32>(a: __mmask8) -> __mmask8 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftri_mask8<const COUNT: u32>(a: __mmask8) -> __mmask8 {
a >> COUNT
a.unbounded_shr(COUNT)
}
/// Compute the bitwise AND of 16-bit masks a and b, and if the result is all zeros, store 1 in dst,
@ -9856,13 +9856,37 @@ mod tests {
let r = _kshiftli_mask8::<3>(a);
let e: __mmask8 = 0b01001000;
assert_eq!(r, e);
let r = _kshiftli_mask8::<7>(a);
let e: __mmask8 = 0b10000000;
assert_eq!(r, e);
let r = _kshiftli_mask8::<8>(a);
let e: __mmask8 = 0b00000000;
assert_eq!(r, e);
let r = _kshiftli_mask8::<9>(a);
let e: __mmask8 = 0b00000000;
assert_eq!(r, e);
}
#[simd_test(enable = "avx512dq")]
unsafe fn test_kshiftri_mask8() {
let a: __mmask8 = 0b01101001;
let a: __mmask8 = 0b10101001;
let r = _kshiftri_mask8::<3>(a);
let e: __mmask8 = 0b00001101;
let e: __mmask8 = 0b00010101;
assert_eq!(r, e);
let r = _kshiftri_mask8::<7>(a);
let e: __mmask8 = 0b00000001;
assert_eq!(r, e);
let r = _kshiftri_mask8::<8>(a);
let e: __mmask8 = 0b00000000;
assert_eq!(r, e);
let r = _kshiftri_mask8::<9>(a);
let e: __mmask8 = 0b00000000;
assert_eq!(r, e);
}

View file

@ -19077,12 +19077,8 @@ pub fn _mm_maskz_expand_pd(k: __mmask8, a: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm512_rol_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x16();
let r = vprold(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_rolv_epi32(a, _mm512_set1_epi32(IMM8))
}
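Routing the immediate forms through the variable-rotate forms is semantically free: rotating every lane by `splat(IMM8)` is lane-wise `rotate_left`, with the count reduced modulo the lane width. The scalar equivalent:

fn main() {
    let x: u32 = 0x8000_0001;
    assert_eq!(x.rotate_left(1), 0x0000_0003);
    assert_eq!(x.rotate_left(33), x.rotate_left(1)); // counts wrap mod 32, like the rolv lowering
}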
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19094,12 +19090,8 @@ pub fn _mm512_rol_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm512_mask_rol_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x16();
let r = vprold(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i32x16()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_mask_rolv_epi32(src, k, a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19111,12 +19103,8 @@ pub fn _mm512_mask_rol_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm512_maskz_rol_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x16();
let r = vprold(a, IMM8);
transmute(simd_select_bitmask(k, r, i32x16::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_maskz_rolv_epi32(k, a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@ -19128,12 +19116,8 @@ pub fn _mm512_maskz_rol_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m5
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm256_rol_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x8();
let r = vprold256(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_rolv_epi32(a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19145,12 +19129,8 @@ pub fn _mm256_rol_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm256_mask_rol_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x8();
let r = vprold256(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i32x8()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_mask_rolv_epi32(src, k, a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19162,12 +19142,8 @@ pub fn _mm256_mask_rol_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m2
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm256_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x8();
let r = vprold256(a, IMM8);
transmute(simd_select_bitmask(k, r, i32x8::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_maskz_rolv_epi32(k, a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@ -19179,12 +19155,8 @@ pub fn _mm256_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m25
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm_rol_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x4();
let r = vprold128(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm_rolv_epi32(a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19196,12 +19168,8 @@ pub fn _mm_rol_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm_mask_rol_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x4();
let r = vprold128(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i32x4()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_mask_rolv_epi32(src, k, a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19213,12 +19181,8 @@ pub fn _mm_mask_rol_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x4();
let r = vprold128(a, IMM8);
transmute(simd_select_bitmask(k, r, i32x4::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_maskz_rolv_epi32(k, a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@ -19230,12 +19194,8 @@ pub fn _mm_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm512_ror_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x16();
let r = vprord(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_rorv_epi32(a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19247,12 +19207,8 @@ pub fn _mm512_ror_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(3)]
pub fn _mm512_mask_ror_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x16();
let r = vprord(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i32x16()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_mask_rorv_epi32(src, k, a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19264,12 +19220,8 @@ pub fn _mm512_mask_ror_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(2)]
pub fn _mm512_maskz_ror_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x16();
let r = vprord(a, IMM8);
transmute(simd_select_bitmask(k, r, i32x16::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_maskz_rorv_epi32(k, a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@ -19281,12 +19233,8 @@ pub fn _mm512_maskz_ror_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m5
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm256_ror_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x8();
let r = vprord256(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_rorv_epi32(a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19298,12 +19246,8 @@ pub fn _mm256_ror_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(3)]
pub fn _mm256_mask_ror_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x8();
let r = vprord256(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i32x8()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_mask_rorv_epi32(src, k, a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19315,12 +19259,8 @@ pub fn _mm256_mask_ror_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m2
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(2)]
pub fn _mm256_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x8();
let r = vprord256(a, IMM8);
transmute(simd_select_bitmask(k, r, i32x8::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_maskz_rorv_epi32(k, a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@ -19332,12 +19272,8 @@ pub fn _mm256_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m25
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm_ror_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x4();
let r = vprord128(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm_rorv_epi32(a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19349,12 +19285,8 @@ pub fn _mm_ror_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(3)]
pub fn _mm_mask_ror_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x4();
let r = vprord128(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i32x4()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_mask_rorv_epi32(src, k, a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19366,12 +19298,8 @@ pub fn _mm_mask_ror_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(2)]
pub fn _mm_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x4();
let r = vprord128(a, IMM8);
transmute(simd_select_bitmask(k, r, i32x4::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_maskz_rorv_epi32(k, a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@ -19383,12 +19311,8 @@ pub fn _mm_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm512_rol_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x8();
let r = vprolq(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_rolv_epi64(a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19400,12 +19324,8 @@ pub fn _mm512_rol_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm512_mask_rol_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x8();
let r = vprolq(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i64x8()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_mask_rolv_epi64(src, k, a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19417,12 +19337,8 @@ pub fn _mm512_mask_rol_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m5
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm512_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x8();
let r = vprolq(a, IMM8);
transmute(simd_select_bitmask(k, r, i64x8::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_maskz_rolv_epi64(k, a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@ -19434,12 +19350,8 @@ pub fn _mm512_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m51
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm256_rol_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x4();
let r = vprolq256(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_rolv_epi64(a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19451,12 +19363,8 @@ pub fn _mm256_rol_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm256_mask_rol_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x4();
let r = vprolq256(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i64x4()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_mask_rolv_epi64(src, k, a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19468,12 +19376,8 @@ pub fn _mm256_mask_rol_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m2
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm256_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x4();
let r = vprolq256(a, IMM8);
transmute(simd_select_bitmask(k, r, i64x4::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_maskz_rolv_epi64(k, a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@ -19485,12 +19389,8 @@ pub fn _mm256_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m25
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm_rol_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x2();
let r = vprolq128(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm_rolv_epi64(a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19502,12 +19402,8 @@ pub fn _mm_rol_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm_mask_rol_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x2();
let r = vprolq128(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i64x2()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_mask_rolv_epi64(src, k, a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19519,12 +19415,8 @@ pub fn _mm_mask_rol_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x2();
let r = vprolq128(a, IMM8);
transmute(simd_select_bitmask(k, r, i64x2::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_maskz_rolv_epi64(k, a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@ -19536,12 +19428,8 @@ pub fn _mm_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(1)]
pub fn _mm512_ror_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x8();
let r = vprorq(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_rorv_epi64(a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19553,12 +19441,8 @@ pub fn _mm512_ror_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(3)]
pub fn _mm512_mask_ror_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x8();
let r = vprorq(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i64x8()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_mask_rorv_epi64(src, k, a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19570,12 +19454,8 @@ pub fn _mm512_mask_ror_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m5
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(2)]
pub fn _mm512_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x8();
let r = vprorq(a, IMM8);
transmute(simd_select_bitmask(k, r, i64x8::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_maskz_rorv_epi64(k, a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@ -19587,12 +19467,8 @@ pub fn _mm512_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m51
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(1)]
pub fn _mm256_ror_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x4();
let r = vprorq256(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_rorv_epi64(a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19604,12 +19480,8 @@ pub fn _mm256_ror_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(3)]
pub fn _mm256_mask_ror_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x4();
let r = vprorq256(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i64x4()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_mask_rorv_epi64(src, k, a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19621,12 +19493,8 @@ pub fn _mm256_mask_ror_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m2
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(2)]
pub fn _mm256_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x4();
let r = vprorq256(a, IMM8);
transmute(simd_select_bitmask(k, r, i64x4::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_maskz_rorv_epi64(k, a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@ -19638,12 +19506,8 @@ pub fn _mm256_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m25
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(1)]
pub fn _mm_ror_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x2();
let r = vprorq128(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm_rorv_epi64(a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19655,12 +19519,8 @@ pub fn _mm_ror_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(3)]
pub fn _mm_mask_ror_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x2();
let r = vprorq128(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i64x2()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_mask_rorv_epi64(src, k, a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19672,12 +19532,8 @@ pub fn _mm_mask_ror_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(2)]
pub fn _mm_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x2();
let r = vprorq128(a, IMM8);
transmute(simd_select_bitmask(k, r, i64x2::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_maskz_rorv_epi64(k, a, _mm_set1_epi64x(IMM8 as i64))
}
/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
@ -21296,7 +21152,13 @@ pub fn _mm_maskz_srav_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub fn _mm512_rolv_epi32(a: __m512i, b: __m512i) -> __m512i {
unsafe { transmute(vprolvd(a.as_i32x16(), b.as_i32x16())) }
unsafe {
transmute(simd_funnel_shl(
a.as_u32x16(),
a.as_u32x16(),
simd_and(b.as_u32x16(), u32x16::splat(31)),
))
}
}
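A funnel shift of a value with itself is a rotate: `fshl(x, x, n)` yields the high word of the double-width value `x:x` shifted left by `n`. A scalar check of the identity (hypothetical helper; the `& 31` mirrors the `simd_and` with `splat(31)` above):

fn fshl_self(x: u32, n: u32) -> u32 {
    let doubled = ((x as u64) << 32) | x as u64; // the "funnel" x:x
    ((doubled << (n & 31)) >> 32) as u32
}

fn main() {
    for n in 0..64 {
        assert_eq!(fshl_self(0xDEAD_BEEF, n), 0xDEAD_BEEFu32.rotate_left(n));
    }
}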
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21335,7 +21197,13 @@ pub fn _mm512_maskz_rolv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub fn _mm256_rolv_epi32(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(vprolvd256(a.as_i32x8(), b.as_i32x8())) }
unsafe {
transmute(simd_funnel_shl(
a.as_u32x8(),
a.as_u32x8(),
simd_and(b.as_u32x8(), u32x8::splat(31)),
))
}
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21374,7 +21242,13 @@ pub fn _mm256_maskz_rolv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub fn _mm_rolv_epi32(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(vprolvd128(a.as_i32x4(), b.as_i32x4())) }
unsafe {
transmute(simd_funnel_shl(
a.as_u32x4(),
a.as_u32x4(),
simd_and(b.as_u32x4(), u32x4::splat(31)),
))
}
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21413,7 +21287,13 @@ pub fn _mm_maskz_rolv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub fn _mm512_rorv_epi32(a: __m512i, b: __m512i) -> __m512i {
unsafe { transmute(vprorvd(a.as_i32x16(), b.as_i32x16())) }
unsafe {
transmute(simd_funnel_shr(
a.as_u32x16(),
a.as_u32x16(),
simd_and(b.as_u32x16(), u32x16::splat(31)),
))
}
}
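Symmetrically, `fshr(x, x, n)` takes the low word of `x:x` shifted right by `n`, which is `rotate_right` (hypothetical helper, same assumptions as the `fshl_self` sketch above):

fn fshr_self(x: u32, n: u32) -> u32 {
    let doubled = ((x as u64) << 32) | x as u64;
    (doubled >> (n & 31)) as u32 // the truncating cast keeps the low word
}

fn main() {
    for n in 0..64 {
        assert_eq!(fshr_self(0xDEAD_BEEF, n), 0xDEAD_BEEFu32.rotate_right(n));
    }
}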
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21452,7 +21332,13 @@ pub fn _mm512_maskz_rorv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub fn _mm256_rorv_epi32(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(vprorvd256(a.as_i32x8(), b.as_i32x8())) }
unsafe {
transmute(simd_funnel_shr(
a.as_u32x8(),
a.as_u32x8(),
simd_and(b.as_u32x8(), u32x8::splat(31)),
))
}
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21491,7 +21377,13 @@ pub fn _mm256_maskz_rorv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub fn _mm_rorv_epi32(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(vprorvd128(a.as_i32x4(), b.as_i32x4())) }
unsafe {
transmute(simd_funnel_shr(
a.as_u32x4(),
a.as_u32x4(),
simd_and(b.as_u32x4(), u32x4::splat(31)),
))
}
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21530,7 +21422,13 @@ pub fn _mm_maskz_rorv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub fn _mm512_rolv_epi64(a: __m512i, b: __m512i) -> __m512i {
unsafe { transmute(vprolvq(a.as_i64x8(), b.as_i64x8())) }
unsafe {
transmute(simd_funnel_shl(
a.as_u64x8(),
a.as_u64x8(),
simd_and(b.as_u64x8(), u64x8::splat(63)),
))
}
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21569,7 +21467,13 @@ pub fn _mm512_maskz_rolv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub fn _mm256_rolv_epi64(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(vprolvq256(a.as_i64x4(), b.as_i64x4())) }
unsafe {
transmute(simd_funnel_shl(
a.as_u64x4(),
a.as_u64x4(),
simd_and(b.as_u64x4(), u64x4::splat(63)),
))
}
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21608,7 +21512,13 @@ pub fn _mm256_maskz_rolv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub fn _mm_rolv_epi64(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(vprolvq128(a.as_i64x2(), b.as_i64x2())) }
unsafe {
transmute(simd_funnel_shl(
a.as_u64x2(),
a.as_u64x2(),
simd_and(b.as_u64x2(), u64x2::splat(63)),
))
}
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21647,7 +21557,13 @@ pub fn _mm_maskz_rolv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub fn _mm512_rorv_epi64(a: __m512i, b: __m512i) -> __m512i {
unsafe { transmute(vprorvq(a.as_i64x8(), b.as_i64x8())) }
unsafe {
transmute(simd_funnel_shr(
a.as_u64x8(),
a.as_u64x8(),
simd_and(b.as_u64x8(), u64x8::splat(63)),
))
}
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21686,7 +21602,13 @@ pub fn _mm512_maskz_rorv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub fn _mm256_rorv_epi64(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(vprorvq256(a.as_i64x4(), b.as_i64x4())) }
unsafe {
transmute(simd_funnel_shr(
a.as_u64x4(),
a.as_u64x4(),
simd_and(b.as_u64x4(), u64x4::splat(63)),
))
}
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21725,7 +21647,13 @@ pub fn _mm256_maskz_rorv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub fn _mm_rorv_epi64(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(vprorvq128(a.as_i64x2(), b.as_i64x2())) }
unsafe {
transmute(simd_funnel_shr(
a.as_u64x2(),
a.as_u64x2(),
simd_and(b.as_u64x2(), u64x2::splat(63)),
))
}
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -29234,7 +29162,7 @@ pub fn _kortestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftli_mask16<const COUNT: u32>(a: __mmask16) -> __mmask16 {
a << COUNT
a.unbounded_shl(COUNT)
}
/// Shift 16-bit mask a right by count bits while shifting in zeros, and store the result in dst.
@ -29245,7 +29173,7 @@ pub fn _kshiftli_mask16<const COUNT: u32>(a: __mmask16) -> __mmask16 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftri_mask16<const COUNT: u32>(a: __mmask16) -> __mmask16 {
a >> COUNT
a.unbounded_shr(COUNT)
}
/// Load 16-bit mask from memory
@ -29665,6 +29593,7 @@ pub fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask
#[cfg_attr(test, assert_instr(vmovntps))]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
// see #1541; we use inline asm to be sure, because the LLVM LangRef isn't clear enough about nontemporal stores
crate::arch::asm!(
vps!("vmovntps", ",{a}"),
p = in(reg) mem_addr,
@ -29691,6 +29620,7 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
#[cfg_attr(test, assert_instr(vmovntpd))]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
// see #1541; we use inline asm to be sure, because the LLVM LangRef isn't clear enough about nontemporal stores
crate::arch::asm!(
vps!("vmovntpd", ",{a}"),
p = in(reg) mem_addr,
@ -29717,6 +29647,7 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
#[cfg_attr(test, assert_instr(vmovntdq))]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_si512(mem_addr: *mut __m512i, a: __m512i) {
// see #1541; we use inline asm to be sure, because the LLVM LangRef isn't clear enough about nontemporal stores
crate::arch::asm!(
vps!("vmovntdq", ",{a}"),
p = in(reg) mem_addr,
@ -42902,62 +42833,6 @@ unsafe extern "C" {
#[link_name = "llvm.x86.avx512.mask.cmp.pd.128"]
fn vcmppd128(a: f64x2, b: f64x2, op: i32, m: i8) -> i8;
#[link_name = "llvm.x86.avx512.mask.prol.d.512"]
fn vprold(a: i32x16, i8: i32) -> i32x16;
#[link_name = "llvm.x86.avx512.mask.prol.d.256"]
fn vprold256(a: i32x8, i8: i32) -> i32x8;
#[link_name = "llvm.x86.avx512.mask.prol.d.128"]
fn vprold128(a: i32x4, i8: i32) -> i32x4;
#[link_name = "llvm.x86.avx512.mask.pror.d.512"]
fn vprord(a: i32x16, i8: i32) -> i32x16;
#[link_name = "llvm.x86.avx512.mask.pror.d.256"]
fn vprord256(a: i32x8, i8: i32) -> i32x8;
#[link_name = "llvm.x86.avx512.mask.pror.d.128"]
fn vprord128(a: i32x4, i8: i32) -> i32x4;
#[link_name = "llvm.x86.avx512.mask.prol.q.512"]
fn vprolq(a: i64x8, i8: i32) -> i64x8;
#[link_name = "llvm.x86.avx512.mask.prol.q.256"]
fn vprolq256(a: i64x4, i8: i32) -> i64x4;
#[link_name = "llvm.x86.avx512.mask.prol.q.128"]
fn vprolq128(a: i64x2, i8: i32) -> i64x2;
#[link_name = "llvm.x86.avx512.mask.pror.q.512"]
fn vprorq(a: i64x8, i8: i32) -> i64x8;
#[link_name = "llvm.x86.avx512.mask.pror.q.256"]
fn vprorq256(a: i64x4, i8: i32) -> i64x4;
#[link_name = "llvm.x86.avx512.mask.pror.q.128"]
fn vprorq128(a: i64x2, i8: i32) -> i64x2;
#[link_name = "llvm.x86.avx512.mask.prolv.d.512"]
fn vprolvd(a: i32x16, b: i32x16) -> i32x16;
#[link_name = "llvm.x86.avx512.mask.prolv.d.256"]
fn vprolvd256(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx512.mask.prolv.d.128"]
fn vprolvd128(a: i32x4, b: i32x4) -> i32x4;
#[link_name = "llvm.x86.avx512.mask.prorv.d.512"]
fn vprorvd(a: i32x16, b: i32x16) -> i32x16;
#[link_name = "llvm.x86.avx512.mask.prorv.d.256"]
fn vprorvd256(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx512.mask.prorv.d.128"]
fn vprorvd128(a: i32x4, b: i32x4) -> i32x4;
#[link_name = "llvm.x86.avx512.mask.prolv.q.512"]
fn vprolvq(a: i64x8, b: i64x8) -> i64x8;
#[link_name = "llvm.x86.avx512.mask.prolv.q.256"]
fn vprolvq256(a: i64x4, b: i64x4) -> i64x4;
#[link_name = "llvm.x86.avx512.mask.prolv.q.128"]
fn vprolvq128(a: i64x2, b: i64x2) -> i64x2;
#[link_name = "llvm.x86.avx512.mask.prorv.q.512"]
fn vprorvq(a: i64x8, b: i64x8) -> i64x8;
#[link_name = "llvm.x86.avx512.mask.prorv.q.256"]
fn vprorvq256(a: i64x4, b: i64x4) -> i64x4;
#[link_name = "llvm.x86.avx512.mask.prorv.q.128"]
fn vprorvq128(a: i64x2, b: i64x2) -> i64x2;
#[link_name = "llvm.x86.avx512.psllv.d.512"]
fn vpsllvd(a: i32x16, b: i32x16) -> i32x16;
#[link_name = "llvm.x86.avx512.psrlv.d.512"]
@ -56220,13 +56095,37 @@ mod tests {
let r = _kshiftli_mask16::<3>(a);
let e: __mmask16 = 0b1011011000011000;
assert_eq!(r, e);
let r = _kshiftli_mask16::<15>(a);
let e: __mmask16 = 0b1000000000000000;
assert_eq!(r, e);
let r = _kshiftli_mask16::<16>(a);
let e: __mmask16 = 0b0000000000000000;
assert_eq!(r, e);
let r = _kshiftli_mask16::<17>(a);
let e: __mmask16 = 0b0000000000000000;
assert_eq!(r, e);
}
#[simd_test(enable = "avx512dq")]
unsafe fn test_kshiftri_mask16() {
let a: __mmask16 = 0b0110100100111100;
let a: __mmask16 = 0b1010100100111100;
let r = _kshiftri_mask16::<3>(a);
let e: __mmask16 = 0b0000110100100111;
let e: __mmask16 = 0b0001010100100111;
assert_eq!(r, e);
let r = _kshiftri_mask16::<15>(a);
let e: __mmask16 = 0b0000000000000001;
assert_eq!(r, e);
let r = _kshiftri_mask16::<16>(a);
let e: __mmask16 = 0b0000000000000000;
assert_eq!(r, e);
let r = _kshiftri_mask16::<17>(a);
let e: __mmask16 = 0b0000000000000000;
assert_eq!(r, e);
}
@ -56432,6 +56331,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 16] };
_mm512_stream_ps(&mut mem.data[0] as *mut f32, a);
_mm_sfence();
for i in 0..16 {
assert_eq!(mem.data[i], get_m512(a, i));
}
@ -56448,6 +56348,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 8] };
_mm512_stream_pd(&mut mem.data[0] as *mut f64, a);
_mm_sfence();
for i in 0..8 {
assert_eq!(mem.data[i], get_m512d(a, i));
}
@ -56464,6 +56365,7 @@ mod tests {
let mut mem = Memory { data: [-1; 8] };
_mm512_stream_si512(mem.data.as_mut_ptr().cast(), a);
_mm_sfence();
for i in 0..8 {
assert_eq!(mem.data[i], get_m512i(a, i));
}

View file

@ -1615,7 +1615,7 @@ pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
#[cfg_attr(test, assert_instr(vaddsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
_mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) + _mm_cvtsh_h(b)) }
}
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@ -1628,7 +1628,16 @@ pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
#[cfg_attr(test, assert_instr(vaddsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
unsafe {
let extractsrc: f16 = simd_extract!(src, 0);
let mut add: f16 = extractsrc;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
add = extracta + extractb;
}
simd_insert!(a, 0, add)
}
}
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@ -1641,7 +1650,15 @@ pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
#[cfg_attr(test, assert_instr(vaddsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
unsafe {
let mut add: f16 = 0.;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
add = extracta + extractb;
}
simd_insert!(a, 0, add)
}
}
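The three scalar-`f16` add forms differ only in where lane 0 comes from when the mask bit is clear: the plain form always computes `a0 + b0`, the masked form falls back to `src0`, and the zero-masked form to `0.0`; lanes 1..7 always copy `a`. The sub/mul/div families below follow the same shape with the operator swapped. A scalar model (hypothetical helper; `f32` stands in for the unstable `f16`):

fn add_sh_model(src0: f32, k: u8, a: [f32; 8], b: [f32; 8], zero_masked: bool) -> [f32; 8] {
    let mut r = a; // upper lanes always come from `a`
    r[0] = if k & 1 != 0 {
        a[0] + b[0]
    } else if zero_masked {
        0.0
    } else {
        src0
    };
    r
}

fn main() {
    let (a, b) = ([1.0; 8], [2.0; 8]);
    assert_eq!(add_sh_model(9.0, 1, a, b, false)[0], 3.0); // mask set: a0 + b0
    assert_eq!(add_sh_model(9.0, 0, a, b, false)[0], 9.0); // mask clear: src
    assert_eq!(add_sh_model(9.0, 0, a, b, true)[0], 0.0); // zeromask clear: 0
}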
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
@ -1927,7 +1944,7 @@ pub fn _mm_maskz_sub_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
#[cfg_attr(test, assert_instr(vsubsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
_mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) - _mm_cvtsh_h(b)) }
}
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
@ -1940,7 +1957,16 @@ pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
#[cfg_attr(test, assert_instr(vsubsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
unsafe {
let extractsrc: f16 = simd_extract!(src, 0);
let mut res: f16 = extractsrc;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
res = extracta - extractb;
}
simd_insert!(a, 0, res)
}
}
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
@ -1953,7 +1979,15 @@ pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
#[cfg_attr(test, assert_instr(vsubsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
unsafe {
let mut res: f16 = 0.;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
res = extracta - extractb;
}
simd_insert!(a, 0, res)
}
}
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
@ -2239,7 +2273,7 @@ pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
#[cfg_attr(test, assert_instr(vmulsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
_mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) * _mm_cvtsh_h(b)) }
}
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@ -2252,7 +2286,16 @@ pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
#[cfg_attr(test, assert_instr(vmulsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
unsafe {
let extractsrc: f16 = simd_extract!(src, 0);
let mut mul: f16 = extractsrc;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
mul = extracta * extractb;
}
simd_insert!(a, 0, mul)
}
}
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@ -2265,7 +2308,15 @@ pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
#[cfg_attr(test, assert_instr(vmulsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
unsafe {
let mut mul: f16 = 0.;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
mul = extracta * extractb;
}
simd_insert!(a, 0, mul)
}
}
/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
@ -2551,7 +2602,7 @@ pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
#[cfg_attr(test, assert_instr(vdivsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
_mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) / _mm_cvtsh_h(b)) }
}
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
@ -2564,7 +2615,16 @@ pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
#[cfg_attr(test, assert_instr(vdivsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
unsafe {
let extractsrc: f16 = simd_extract!(src, 0);
let mut div: f16 = extractsrc;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
div = extracta / extractb;
}
simd_insert!(a, 0, div)
}
}
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
@ -2577,7 +2637,15 @@ pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
#[cfg_attr(test, assert_instr(vdivsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
unsafe {
let mut div: f16 = 0.;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
div = extracta / extractb;
}
simd_insert!(a, 0, div)
}
}
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
@ -7116,7 +7184,11 @@ pub fn _mm_maskz_fnmsub_round_sh<const ROUNDING: i32>(
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
unsafe { vfmaddsubph_128(a, b, c) }
unsafe {
let add = simd_fma(a, b, c);
let sub = simd_fma(a, b, simd_neg(c));
simd_shuffle!(sub, add, [0, 9, 2, 11, 4, 13, 6, 15])
}
}
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
@ -7167,7 +7239,15 @@ pub fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) ->
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
unsafe { vfmaddsubph_256(a, b, c) }
unsafe {
let add = simd_fma(a, b, c);
let sub = simd_fma(a, b, simd_neg(c));
simd_shuffle!(
sub,
add,
[0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31]
)
}
}
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
@ -7218,7 +7298,18 @@ pub fn _mm256_maskz_fmaddsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
_mm512_fmaddsub_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
unsafe {
let add = simd_fma(a, b, c);
let sub = simd_fma(a, b, simd_neg(c));
simd_shuffle!(
sub,
add,
[
0, 33, 2, 35, 4, 37, 6, 39, 8, 41, 10, 43, 12, 45, 14, 47, 16, 49, 18, 51, 20, 53,
22, 55, 24, 57, 26, 59, 28, 61, 30, 63
]
)
}
}
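Editor's note: a scalar reference for the interleave used by all three widths above — `sub` supplies the even lanes (a*b - c) and `add` the odd lanes (a*b + c), which is what the shuffle indices encode. f32 stands in for the unstable f16 type:

fn fmaddsub_ref(a: &[f32], b: &[f32], c: &[f32]) -> Vec<f32> {
    (0..a.len())
        .map(|i| {
            if i % 2 == 0 {
                a[i].mul_add(b[i], -c[i]) // even lanes: a*b - c
            } else {
                a[i].mul_add(b[i], c[i]) // odd lanes: a*b + c
            }
        })
        .collect()
}

With `c` negated (the fmsubadd forms below), the pattern flips: even lanes become a*b + c and odd lanes a*b - c.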
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
@ -7391,7 +7482,7 @@ pub fn _mm512_maskz_fmaddsub_round_ph<const ROUNDING: i32>(
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
unsafe { vfmaddsubph_128(a, b, simd_neg(c)) }
_mm_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
}
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
@ -7442,7 +7533,7 @@ pub fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) ->
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
unsafe { vfmaddsubph_256(a, b, simd_neg(c)) }
_mm256_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
}
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
@ -7493,7 +7584,7 @@ pub fn _mm256_maskz_fmsubadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
_mm512_fmsubadd_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
_mm512_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
}
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
@ -11111,7 +11202,7 @@ pub fn _mm256_reduce_mul_ph(a: __m256h) -> f16 {
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_reduce_mul_ph(a: __m512h) -> f16 {
pub fn _mm512_reduce_mul_ph(a: __m512h) -> f16 {
unsafe {
let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
let q = simd_shuffle!(
@ -16341,10 +16432,6 @@ unsafe extern "C" {
#[link_name = "llvm.x86.avx512fp16.vfmadd.f16"]
fn vfmaddsh(a: f16, b: f16, c: f16, rounding: i32) -> f16;
#[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.128"]
fn vfmaddsubph_128(a: __m128h, b: __m128h, c: __m128h) -> __m128h;
#[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.256"]
fn vfmaddsubph_256(a: __m256h, b: __m256h, c: __m256h) -> __m256h;
#[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.512"]
fn vfmaddsubph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;

View file

@ -500,7 +500,13 @@ pub fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvq))]
pub fn _mm512_shldv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
unsafe { transmute(simd_funnel_shl(a.as_i64x8(), b.as_i64x8(), c.as_i64x8())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i64x8(),
b.as_i64x8(),
simd_and(c.as_i64x8(), i64x8::splat(63)),
))
}
}
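Editor's note: the `& 63` mask matches the hardware, which takes the shift amount modulo the lane width; without it, `simd_funnel_shl` would see out-of-range shift amounts. A per-lane scalar sketch of the semantics:

fn shldv64_lane(a: u64, b: u64, c: u64) -> u64 {
    let s = (c & 63) as u32;
    // High 64 bits of the 128-bit value (a:b) << s; s == 0 is special-cased
    // because shifting a u64 by 64 is not defined.
    if s == 0 { a } else { (a << s) | (b >> (64 - s)) }
}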
/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -539,7 +545,13 @@ pub fn _mm512_maskz_shldv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i)
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvq))]
pub fn _mm256_shldv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
unsafe { transmute(simd_funnel_shl(a.as_i64x4(), b.as_i64x4(), c.as_i64x4())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i64x4(),
b.as_i64x4(),
simd_and(c.as_i64x4(), i64x4::splat(63)),
))
}
}
/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -578,7 +590,13 @@ pub fn _mm256_maskz_shldv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i)
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvq))]
pub fn _mm_shldv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
unsafe { transmute(simd_funnel_shl(a.as_i64x2(), b.as_i64x2(), c.as_i64x2())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i64x2(),
b.as_i64x2(),
simd_and(c.as_i64x2(), i64x2::splat(63)),
))
}
}
/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -617,7 +635,13 @@ pub fn _mm_maskz_shldv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) ->
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvd))]
pub fn _mm512_shldv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
unsafe { transmute(simd_funnel_shl(a.as_i32x16(), b.as_i32x16(), c.as_i32x16())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i32x16(),
b.as_i32x16(),
simd_and(c.as_i32x16(), i32x16::splat(31)),
))
}
}
/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -656,7 +680,13 @@ pub fn _mm512_maskz_shldv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvd))]
pub fn _mm256_shldv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
unsafe { transmute(simd_funnel_shl(a.as_i32x8(), b.as_i32x8(), c.as_i32x8())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i32x8(),
b.as_i32x8(),
simd_and(c.as_i32x8(), i32x8::splat(31)),
))
}
}
/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -695,7 +725,13 @@ pub fn _mm256_maskz_shldv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i)
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvd))]
pub fn _mm_shldv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
unsafe { transmute(simd_funnel_shl(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i32x4(),
b.as_i32x4(),
simd_and(c.as_i32x4(), i32x4::splat(31)),
))
}
}
/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -734,7 +770,13 @@ pub fn _mm_maskz_shldv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) ->
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvw))]
pub fn _mm512_shldv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
unsafe { transmute(simd_funnel_shl(a.as_i16x32(), b.as_i16x32(), c.as_i16x32())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i16x32(),
b.as_i16x32(),
simd_and(c.as_i16x32(), i16x32::splat(15)),
))
}
}
/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -773,7 +815,13 @@ pub fn _mm512_maskz_shldv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvw))]
pub fn _mm256_shldv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
unsafe { transmute(simd_funnel_shl(a.as_i16x16(), b.as_i16x16(), c.as_i16x16())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i16x16(),
b.as_i16x16(),
simd_and(c.as_i16x16(), i16x16::splat(15)),
))
}
}
/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -812,7 +860,13 @@ pub fn _mm256_maskz_shldv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvw))]
pub fn _mm_shldv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
unsafe { transmute(simd_funnel_shl(a.as_i16x8(), b.as_i16x8(), c.as_i16x8())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i16x8(),
b.as_i16x8(),
simd_and(c.as_i16x8(), i16x8::splat(15)),
))
}
}
/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -851,7 +905,13 @@ pub fn _mm_maskz_shldv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) ->
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvq))]
pub fn _mm512_shrdv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
unsafe { transmute(simd_funnel_shr(b.as_i64x8(), a.as_i64x8(), c.as_i64x8())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i64x8(),
a.as_i64x8(),
simd_and(c.as_i64x8(), i64x8::splat(63)),
))
}
}
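Editor's note: the right-funnel counterpart, again per lane — the low 64 bits of the 128-bit value (b:a) shifted right by (c & 63), so bits shift in from `b` at the top:

fn shrdv64_lane(a: u64, b: u64, c: u64) -> u64 {
    let s = (c & 63) as u32;
    if s == 0 { a } else { (a >> s) | (b << (64 - s)) }
}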
/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -890,7 +950,13 @@ pub fn _mm512_maskz_shrdv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i)
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvq))]
pub fn _mm256_shrdv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
unsafe { transmute(simd_funnel_shr(b.as_i64x4(), a.as_i64x4(), c.as_i64x4())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i64x4(),
a.as_i64x4(),
simd_and(c.as_i64x4(), i64x4::splat(63)),
))
}
}
/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -929,7 +995,13 @@ pub fn _mm256_maskz_shrdv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i)
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvq))]
pub fn _mm_shrdv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
unsafe { transmute(simd_funnel_shr(b.as_i64x2(), a.as_i64x2(), c.as_i64x2())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i64x2(),
a.as_i64x2(),
simd_and(c.as_i64x2(), i64x2::splat(63)),
))
}
}
/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -968,7 +1040,13 @@ pub fn _mm_maskz_shrdv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) ->
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvd))]
pub fn _mm512_shrdv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
unsafe { transmute(simd_funnel_shr(b.as_i32x16(), a.as_i32x16(), c.as_i32x16())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i32x16(),
a.as_i32x16(),
simd_and(c.as_i32x16(), i32x16::splat(31)),
))
}
}
/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -1007,7 +1085,13 @@ pub fn _mm512_maskz_shrdv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvd))]
pub fn _mm256_shrdv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
unsafe { transmute(simd_funnel_shr(b.as_i32x8(), a.as_i32x8(), c.as_i32x8())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i32x8(),
a.as_i32x8(),
simd_and(c.as_i32x8(), i32x8::splat(31)),
))
}
}
/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -1046,7 +1130,13 @@ pub fn _mm256_maskz_shrdv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i)
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvd))]
pub fn _mm_shrdv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
unsafe { transmute(simd_funnel_shr(b.as_i32x4(), a.as_i32x4(), c.as_i32x4())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i32x4(),
a.as_i32x4(),
simd_and(c.as_i32x4(), i32x4::splat(31)),
))
}
}
/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -1085,7 +1175,13 @@ pub fn _mm_maskz_shrdv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) ->
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvw))]
pub fn _mm512_shrdv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
unsafe { transmute(simd_funnel_shr(b.as_i16x32(), a.as_i16x32(), c.as_i16x32())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i16x32(),
a.as_i16x32(),
simd_and(c.as_i16x32(), i16x32::splat(15)),
))
}
}
/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -1124,7 +1220,13 @@ pub fn _mm512_maskz_shrdv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvw))]
pub fn _mm256_shrdv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
unsafe { transmute(simd_funnel_shr(b.as_i16x16(), a.as_i16x16(), c.as_i16x16())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i16x16(),
a.as_i16x16(),
simd_and(c.as_i16x16(), i16x16::splat(15)),
))
}
}
/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -1163,7 +1265,13 @@ pub fn _mm256_maskz_shrdv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvw))]
pub fn _mm_shrdv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
unsafe { transmute(simd_funnel_shr(b.as_i16x8(), a.as_i16x8(), c.as_i16x8())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i16x8(),
a.as_i16x8(),
simd_and(c.as_i16x8(), i16x8::splat(15)),
))
}
}
/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

View file

@ -10,7 +10,7 @@ use stdarch_test::assert_instr;
#[inline]
#[cfg_attr(test, assert_instr(bswap))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _bswap(x: i32) -> i32 {
pub fn _bswap(x: i32) -> i32 {
x.swap_bytes()
}
@ -20,9 +20,7 @@ mod tests {
#[test]
fn test_bswap() {
unsafe {
assert_eq!(_bswap(0x0EADBE0F), 0x0FBEAD0E);
assert_eq!(_bswap(0x00000000), 0x00000000);
}
assert_eq!(_bswap(0x0EADBE0F), 0x0FBEAD0E);
assert_eq!(_bswap(0x00000000), 0x00000000);
}
}

View file

@ -3,16 +3,13 @@
//! [F16C intrinsics]: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=fp16&expand=1769
use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;
#[cfg(test)]
use stdarch_test::assert_instr;
#[allow(improper_ctypes)]
unsafe extern "unadjusted" {
#[link_name = "llvm.x86.vcvtph2ps.128"]
fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4;
#[link_name = "llvm.x86.vcvtph2ps.256"]
fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8;
#[link_name = "llvm.x86.vcvtps2ph.128"]
fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8;
#[link_name = "llvm.x86.vcvtps2ph.256"]
@ -29,7 +26,11 @@ unsafe extern "unadjusted" {
#[cfg_attr(test, assert_instr("vcvtph2ps"))]
#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")]
pub fn _mm_cvtph_ps(a: __m128i) -> __m128 {
unsafe { transmute(llvm_vcvtph2ps_128(transmute(a))) }
unsafe {
let a: f16x8 = transmute(a);
let a: f16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
simd_cast(a)
}
}
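Editor's note: lane-wise, the cast above is an exact widening, the same as a scalar `as` cast. A sketch (requires the unstable f16 type):

fn cvtph4_ref(a: [f16; 4]) -> [f32; 4] {
    // each half-precision lane widens losslessly to single precision
    [a[0] as f32, a[1] as f32, a[2] as f32, a[3] as f32]
}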
/// Converts the 8 x 16-bit half-precision float values in the 128-bit vector
@ -41,7 +42,10 @@ pub fn _mm_cvtph_ps(a: __m128i) -> __m128 {
#[cfg_attr(test, assert_instr("vcvtph2ps"))]
#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")]
pub fn _mm256_cvtph_ps(a: __m128i) -> __m256 {
unsafe { transmute(llvm_vcvtph2ps_256(transmute(a))) }
unsafe {
let a: f16x8 = transmute(a);
simd_cast(a)
}
}
/// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x

View file

@ -745,7 +745,6 @@ mod tests {
#![allow(overflowing_literals)]
use core::hint::black_box;
use core::intrinsics::size_of;
use stdarch_test::simd_test;
use crate::core_arch::x86::*;
@ -881,26 +880,20 @@ mod tests {
}
#[target_feature(enable = "sse2")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i {
let byte_offset = word_index * 16 / size_of::<T>();
let pointer = data.as_ptr().add(byte_offset) as *const __m128i;
let pointer = data.as_ptr().byte_add(word_index * 16) as *const __m128i;
_mm_loadu_si128(black_box(pointer))
}
#[target_feature(enable = "avx")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i {
let byte_offset = word_index * 32 / size_of::<T>();
let pointer = data.as_ptr().add(byte_offset) as *const __m256i;
let pointer = data.as_ptr().byte_add(word_index * 32) as *const __m256i;
_mm256_loadu_si256(black_box(pointer))
}
#[target_feature(enable = "avx512f")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i {
let byte_offset = word_index * 64 / size_of::<T>();
let pointer = data.as_ptr().add(byte_offset) as *const _;
let pointer = data.as_ptr().byte_add(word_index * 64) as *const __m512i;
_mm512_loadu_si512(black_box(pointer))
}

View file

@ -26,8 +26,8 @@ use stdarch_test::assert_instr;
#[target_feature(enable = "rdrand")]
#[cfg_attr(test, assert_instr(rdrand))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 {
let (v, flag) = x86_rdrand16_step();
pub fn _rdrand16_step(val: &mut u16) -> i32 {
let (v, flag) = unsafe { x86_rdrand16_step() };
*val = v;
flag
}
@ -40,8 +40,8 @@ pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 {
#[target_feature(enable = "rdrand")]
#[cfg_attr(test, assert_instr(rdrand))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 {
let (v, flag) = x86_rdrand32_step();
pub fn _rdrand32_step(val: &mut u32) -> i32 {
let (v, flag) = unsafe { x86_rdrand32_step() };
*val = v;
flag
}
@ -54,8 +54,8 @@ pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 {
#[target_feature(enable = "rdseed")]
#[cfg_attr(test, assert_instr(rdseed))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _rdseed16_step(val: &mut u16) -> i32 {
let (v, flag) = x86_rdseed16_step();
pub fn _rdseed16_step(val: &mut u16) -> i32 {
let (v, flag) = unsafe { x86_rdseed16_step() };
*val = v;
flag
}
@ -68,8 +68,8 @@ pub unsafe fn _rdseed16_step(val: &mut u16) -> i32 {
#[target_feature(enable = "rdseed")]
#[cfg_attr(test, assert_instr(rdseed))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _rdseed32_step(val: &mut u32) -> i32 {
let (v, flag) = x86_rdseed32_step();
pub fn _rdseed32_step(val: &mut u32) -> i32 {
let (v, flag) = unsafe { x86_rdseed32_step() };
*val = v;
flag
}

View file

@ -882,7 +882,7 @@ pub fn _mm_cvtss_f32(a: __m128) -> f32 {
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
unsafe { cvtsi2ss(a, b) }
unsafe { simd_insert!(a, 0, b as f32) }
}
/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
@ -1445,8 +1445,8 @@ pub fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sfence() {
sfence()
pub fn _mm_sfence() {
unsafe { sfence() }
}
/// Gets the unsigned 32-bit value of the MXCSR control and status register.
@ -1887,6 +1887,8 @@ pub const _MM_HINT_ET1: i32 = 6;
/// * Prefetching may also fail if there are not enough memory-subsystem
/// resources (e.g., request buffers).
///
/// Note: this intrinsic is safe to use even though it takes a raw pointer argument. Prefetching
/// cannot change the observable behavior of the program; in particular, it does not trap on
/// invalid pointers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch)
#[inline]
@ -1897,11 +1899,13 @@ pub const _MM_HINT_ET1: i32 = 6;
#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
pub fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
static_assert_uimm_bits!(STRATEGY, 3);
// We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
// `locality` and `rw` are based on our `STRATEGY`.
prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
unsafe {
prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
}
}
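Editor's note: how a STRATEGY value decodes into the llvm.prefetch arguments — bit 2 selects read (0) vs. write (1) intent, and the low two bits give the locality:

const fn decode_strategy(strategy: i32) -> (i32, i32) {
    ((strategy >> 2) & 1, strategy & 3) // (rw, locality)
}

For example, _MM_HINT_T0 = 3 decodes to (0, 3), _MM_HINT_NTA = 0 to (0, 0), and _MM_HINT_ET0 = 7 to (1, 3).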
/// Returns vector of type __m128 with indeterminate elements.
@ -1989,8 +1993,6 @@ unsafe extern "C" {
fn cvtss2si(a: __m128) -> i32;
#[link_name = "llvm.x86.sse.cvttss2si"]
fn cvttss2si(a: __m128) -> i32;
#[link_name = "llvm.x86.sse.cvtsi2ss"]
fn cvtsi2ss(a: __m128, b: i32) -> __m128;
#[link_name = "llvm.x86.sse.sfence"]
fn sfence();
#[link_name = "llvm.x86.sse.stmxcsr"]
@ -2024,6 +2026,7 @@ unsafe extern "C" {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntps", ",{a}"),
p = in(reg) mem_addr,
@ -3331,6 +3334,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 4] };
_mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
_mm_sfence();
for i in 0..4 {
assert_eq!(mem.data[i], get_m128(a, i));
}

View file

@ -19,10 +19,10 @@ use crate::{
#[inline]
#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_pause() {
pub fn _mm_pause() {
// note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
// the SSE2 target-feature - therefore it does not require any target features
pause()
unsafe { pause() }
}
/// Invalidates and flushes the cache line that contains `p` from all levels of
@ -49,8 +49,8 @@ pub unsafe fn _mm_clflush(p: *const u8) {
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(lfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_lfence() {
lfence()
pub fn _mm_lfence() {
unsafe { lfence() }
}
/// Performs a serializing operation on all load-from-memory and store-to-memory
@ -65,8 +65,8 @@ pub unsafe fn _mm_lfence() {
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mfence() {
mfence()
pub fn _mm_mfence() {
unsafe { mfence() }
}
/// Adds packed 8-bit integers in `a` and `b`.
@ -201,7 +201,12 @@ pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
unsafe {
let r: i32x8 = simd_mul(simd_cast(a.as_i16x8()), simd_cast(b.as_i16x8()));
let even: i32x4 = simd_shuffle!(r, r, [0, 2, 4, 6]);
let odd: i32x4 = simd_shuffle!(r, r, [1, 3, 5, 7]);
simd_add(even, odd).as_m128i()
}
}
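Editor's note: a scalar reference for pmaddwd as implemented above — widen each i16 product to i32, then sum adjacent pairs (the only case that wraps is two maximal negative products):

fn madd_epi16_ref(a: [i16; 8], b: [i16; 8]) -> [i32; 4] {
    core::array::from_fn(|i| {
        (a[2 * i] as i32 * b[2 * i] as i32)
            .wrapping_add(a[2 * i + 1] as i32 * b[2 * i + 1] as i32)
    })
}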
/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
@ -1358,6 +1363,7 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
#[cfg_attr(test, assert_instr(movntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntdq", ",{a}"),
p = in(reg) mem_addr,
@ -1385,6 +1391,7 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movnti", ",{a:e}"), // `:e` for 32bit value
p = in(reg) mem_addr,
@ -2417,7 +2424,10 @@ pub fn _mm_cvtsd_f64(a: __m128d) -> f64 {
#[cfg_attr(test, assert_instr(cvtss2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
unsafe { cvtss2sd(a, b) }
unsafe {
let elt: f32 = simd_extract!(b, 0);
simd_insert!(a, 0, elt as f64)
}
}
/// Converts packed double-precision (64-bit) floating-point elements in `a` to
@ -2619,6 +2629,7 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntpd", ",{a}"),
p = in(reg) mem_addr,
@ -3043,8 +3054,6 @@ unsafe extern "C" {
fn lfence();
#[link_name = "llvm.x86.sse2.mfence"]
fn mfence();
#[link_name = "llvm.x86.sse2.pmadd.wd"]
fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
#[link_name = "llvm.x86.sse2.psad.bw"]
fn psadbw(a: u8x16, b: u8x16) -> u64x2;
#[link_name = "llvm.x86.sse2.psll.w"]
@ -3115,8 +3124,6 @@ unsafe extern "C" {
fn cvtsd2si(a: __m128d) -> i32;
#[link_name = "llvm.x86.sse2.cvtsd2ss"]
fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
#[link_name = "llvm.x86.sse2.cvtss2sd"]
fn cvtss2sd(a: __m128d, b: __m128) -> __m128d;
#[link_name = "llvm.x86.sse2.cvttpd2dq"]
fn cvttpd2dq(a: __m128d) -> i32x4;
#[link_name = "llvm.x86.sse2.cvttsd2si"]
@ -3142,7 +3149,7 @@ mod tests {
#[test]
fn test_mm_pause() {
unsafe { _mm_pause() }
_mm_pause()
}
#[simd_test(enable = "sse2")]
@ -4066,6 +4073,7 @@ mod tests {
);
let mut r = _mm_set1_epi8(0);
_mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
_mm_sfence();
let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
assert_eq_m128i(r, e);
}
@ -4102,6 +4110,7 @@ mod tests {
let a = _mm_setr_epi32(1, 2, 3, 4);
let mut r = _mm_undefined_si128();
_mm_stream_si128(ptr::addr_of_mut!(r), a);
_mm_sfence();
assert_eq_m128i(r, a);
}
@ -4113,6 +4122,7 @@ mod tests {
let a: i32 = 7;
let mut mem = boxed::Box::<i32>::new(-1);
_mm_stream_si32(ptr::addr_of_mut!(*mem), a);
_mm_sfence();
assert_eq!(a, *mem);
}
@ -4809,6 +4819,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 2] };
_mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
_mm_sfence();
for i in 0..2 {
assert_eq!(mem.data[i], get_m128d(a, i));
}

View file

@ -51,7 +51,11 @@ pub fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(haddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d {
unsafe { haddpd(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 2]);
let odd = simd_shuffle!(a, b, [1, 3]);
simd_add(even, odd)
}
}
/// Horizontally adds adjacent pairs of single-precision (32-bit)
@ -63,7 +67,11 @@ pub fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(haddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 {
unsafe { haddps(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 2, 4, 6]);
let odd = simd_shuffle!(a, b, [1, 3, 5, 7]);
simd_add(even, odd)
}
}
/// Horizontally subtract adjacent pairs of double-precision (64-bit)
@ -75,7 +83,11 @@ pub fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(hsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d {
unsafe { hsubpd(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 2]);
let odd = simd_shuffle!(a, b, [1, 3]);
simd_sub(even, odd)
}
}
/// Horizontally adds adjacent pairs of single-precision (32-bit)
@ -87,7 +99,11 @@ pub fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(hsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 {
unsafe { hsubps(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 2, 4, 6]);
let odd = simd_shuffle!(a, b, [1, 3, 5, 7]);
simd_sub(even, odd)
}
}
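Editor's note: what the even/odd shuffles in this family compute — adjacent pairs within each source, with `a` filling the low half of the result and `b` the high half:

fn hadd_ps_ref(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]]
}

fn hsub_ps_ref(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [a[0] - a[1], a[2] - a[3], b[0] - b[1], b[2] - b[3]]
}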
/// Loads 128-bits of integer data from unaligned memory.
@ -153,14 +169,6 @@ pub fn _mm_moveldup_ps(a: __m128) -> __m128 {
#[allow(improper_ctypes)]
unsafe extern "C" {
#[link_name = "llvm.x86.sse3.hadd.pd"]
fn haddpd(a: __m128d, b: __m128d) -> __m128d;
#[link_name = "llvm.x86.sse3.hadd.ps"]
fn haddps(a: __m128, b: __m128) -> __m128;
#[link_name = "llvm.x86.sse3.hsub.pd"]
fn hsubpd(a: __m128d, b: __m128d) -> __m128d;
#[link_name = "llvm.x86.sse3.hsub.ps"]
fn hsubps(a: __m128, b: __m128) -> __m128;
#[link_name = "llvm.x86.sse3.ldu.dq"]
fn lddqu(mem_addr: *const i8) -> i8x16;
}

View file

@ -1006,7 +1006,10 @@ pub fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
unsafe { ptestz(a.as_i64x2(), mask.as_i64x2()) }
unsafe {
let r = simd_reduce_or(simd_and(a.as_i64x2(), mask.as_i64x2()));
(0i64 == r) as i32
}
}
/// Tests whether the specified bits in a 128-bit integer vector are all
@ -1029,7 +1032,13 @@ pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
unsafe { ptestc(a.as_i64x2(), mask.as_i64x2()) }
unsafe {
let r = simd_reduce_or(simd_and(
simd_xor(a.as_i64x2(), i64x2::splat(!0)),
mask.as_i64x2(),
));
(0i64 == r) as i32
}
}
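Editor's note: ptest flag semantics in scalar form, with u128 standing in for __m128i — testz reports ZF = ((a AND mask) == 0), and testc reports CF = ((NOT a AND mask) == 0):

fn testz_ref(a: u128, mask: u128) -> i32 {
    ((a & mask) == 0) as i32
}

fn testc_ref(a: u128, mask: u128) -> i32 {
    ((!a & mask) == 0) as i32
}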
/// Tests whether the specified bits in a 128-bit integer vector are
@ -1165,10 +1174,6 @@ unsafe extern "C" {
fn phminposuw(a: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse41.mpsadbw"]
fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
#[link_name = "llvm.x86.sse41.ptestz"]
fn ptestz(a: i64x2, mask: i64x2) -> i32;
#[link_name = "llvm.x86.sse41.ptestc"]
fn ptestc(a: i64x2, mask: i64x2) -> i32;
#[link_name = "llvm.x86.sse41.ptestnzc"]
fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}

View file

@ -15,10 +15,6 @@ unsafe extern "C" {
fn insertq(x: i64x2, y: i64x2) -> i64x2;
#[link_name = "llvm.x86.sse4a.insertqi"]
fn insertqi(x: i64x2, y: i64x2, len: u8, idx: u8) -> i64x2;
#[link_name = "llvm.x86.sse4a.movnt.sd"]
fn movntsd(x: *mut f64, y: __m128d);
#[link_name = "llvm.x86.sse4a.movnt.ss"]
fn movntss(x: *mut f32, y: __m128);
}
/// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
@ -114,7 +110,13 @@ pub fn _mm_inserti_si64<const LEN: i32, const IDX: i32>(x: __m128i, y: __m128i)
#[cfg_attr(test, assert_instr(movntsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
movntsd(p, a);
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntsd", ",{a}"),
p = in(reg) p,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
}
/// Non-temporal store of `a.0` into `p`.
@ -134,7 +136,13 @@ pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
#[cfg_attr(test, assert_instr(movntss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) {
movntss(p, a);
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntss", ",{a}"),
p = in(reg) p,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
}
#[cfg(test)]
@ -209,6 +217,7 @@ mod tests {
let x = _mm_setr_pd(3.0, 4.0);
_mm_stream_sd(d, x);
_mm_sfence();
}
assert_eq!(mem.data[0], 3.0);
assert_eq!(mem.data[1], 2.0);
@ -234,6 +243,7 @@ mod tests {
let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
_mm_stream_ss(d, x);
_mm_sfence();
}
assert_eq!(mem.data[0], 5.0);
assert_eq!(mem.data[1], 2.0);

View file

@ -164,7 +164,13 @@ pub fn _mm_alignr_epi8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(phaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(phaddw128(a.as_i16x8(), b.as_i16x8())) }
let a = a.as_i16x8();
let b = b.as_i16x8();
unsafe {
let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
simd_add(even, odd).as_m128i()
}
}
/// Horizontally adds the adjacent pairs of values contained in 2 packed
@ -189,7 +195,13 @@ pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(phaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(phaddd128(a.as_i32x4(), b.as_i32x4())) }
let a = a.as_i32x4();
let b = b.as_i32x4();
unsafe {
let even: i32x4 = simd_shuffle!(a, b, [0, 2, 4, 6]);
let odd: i32x4 = simd_shuffle!(a, b, [1, 3, 5, 7]);
simd_add(even, odd).as_m128i()
}
}
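Editor's note: the integer horizontal adds use the same even/odd pattern, wrapping on overflow as the hardware does:

fn hadd_epi32_ref(a: [i32; 4], b: [i32; 4]) -> [i32; 4] {
    [
        a[0].wrapping_add(a[1]),
        a[2].wrapping_add(a[3]),
        b[0].wrapping_add(b[1]),
        b[2].wrapping_add(b[3]),
    ]
}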
/// Horizontally subtract the adjacent pairs of values contained in 2
@ -201,7 +213,13 @@ pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(phsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(phsubw128(a.as_i16x8(), b.as_i16x8())) }
let a = a.as_i16x8();
let b = b.as_i16x8();
unsafe {
let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
simd_sub(even, odd).as_m128i()
}
}
/// Horizontally subtract the adjacent pairs of values contained in 2
@ -227,7 +245,13 @@ pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(phsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(phsubd128(a.as_i32x4(), b.as_i32x4())) }
let a = a.as_i32x4();
let b = b.as_i32x4();
unsafe {
let even: i32x4 = simd_shuffle!(a, b, [0, 2, 4, 6]);
let odd: i32x4 = simd_shuffle!(a, b, [1, 3, 5, 7]);
simd_sub(even, odd).as_m128i()
}
}
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
@ -305,24 +329,12 @@ unsafe extern "C" {
#[link_name = "llvm.x86.ssse3.pshuf.b.128"]
fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
#[link_name = "llvm.x86.ssse3.phadd.w.128"]
fn phaddw128(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.ssse3.phadd.sw.128"]
fn phaddsw128(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.ssse3.phadd.d.128"]
fn phaddd128(a: i32x4, b: i32x4) -> i32x4;
#[link_name = "llvm.x86.ssse3.phsub.w.128"]
fn phsubw128(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.ssse3.phsub.sw.128"]
fn phsubsw128(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.ssse3.phsub.d.128"]
fn phsubd128(a: i32x4, b: i32x4) -> i32x4;
#[link_name = "llvm.x86.ssse3.pmadd.ub.sw.128"]
fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8;

View file

@ -30,7 +30,7 @@ unsafe extern "C" {
#[cfg_attr(test, assert_instr(bextr, CONTROL = 0x0404))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _bextri_u32<const CONTROL: u32>(a: u32) -> u32 {
pub fn _bextri_u32<const CONTROL: u32>(a: u32) -> u32 {
static_assert_uimm_bits!(CONTROL, 16);
unsafe { bextri_u32(a, CONTROL) }
}
@ -42,7 +42,7 @@ pub unsafe fn _bextri_u32<const CONTROL: u32>(a: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcfill))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcfill_u32(x: u32) -> u32 {
pub fn _blcfill_u32(x: u32) -> u32 {
x & (x.wrapping_add(1))
}
@ -53,7 +53,7 @@ pub unsafe fn _blcfill_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blci))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blci_u32(x: u32) -> u32 {
pub fn _blci_u32(x: u32) -> u32 {
x | !x.wrapping_add(1)
}
@ -64,7 +64,7 @@ pub unsafe fn _blci_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcic))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcic_u32(x: u32) -> u32 {
pub fn _blcic_u32(x: u32) -> u32 {
!x & x.wrapping_add(1)
}
@ -76,7 +76,7 @@ pub unsafe fn _blcic_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcmsk))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
pub fn _blcmsk_u32(x: u32) -> u32 {
x ^ x.wrapping_add(1)
}
@ -87,7 +87,7 @@ pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcs))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcs_u32(x: u32) -> u32 {
pub fn _blcs_u32(x: u32) -> u32 {
x | x.wrapping_add(1)
}
@ -98,7 +98,7 @@ pub unsafe fn _blcs_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blsfill))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blsfill_u32(x: u32) -> u32 {
pub fn _blsfill_u32(x: u32) -> u32 {
x | x.wrapping_sub(1)
}
@ -109,7 +109,7 @@ pub unsafe fn _blsfill_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blsic))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blsic_u32(x: u32) -> u32 {
pub fn _blsic_u32(x: u32) -> u32 {
!x | x.wrapping_sub(1)
}
@ -121,7 +121,7 @@ pub unsafe fn _blsic_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(t1mskc))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _t1mskc_u32(x: u32) -> u32 {
pub fn _t1mskc_u32(x: u32) -> u32 {
!x | x.wrapping_add(1)
}
@ -133,7 +133,7 @@ pub unsafe fn _t1mskc_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(tzmsk))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _tzmsk_u32(x: u32) -> u32 {
pub fn _tzmsk_u32(x: u32) -> u32 {
!x & x.wrapping_sub(1)
}
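Editor's note: the TBM identities above, checked on one sample value with plain integer ops:

fn main() {
    let x: u32 = 0b0110_0111;
    assert_eq!(x & x.wrapping_add(1), 0b0110_0000); // _blcfill: clear the trailing 1s
    assert_eq!(x ^ x.wrapping_add(1), 0b0000_1111); // _blcmsk: mask through the lowest 0
    assert_eq!(!x & x.wrapping_sub(1), 0); // _tzmsk: trailing-zero mask (none here)
}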

View file

@ -159,29 +159,39 @@ pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) {
xrstors(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
}
#[cfg(test)]
pub(crate) use tests::XsaveArea;
#[cfg(test)]
mod tests {
use std::{fmt, prelude::v1::*};
use std::boxed::Box;
use crate::core_arch::x86::*;
use stdarch_test::simd_test;
#[repr(align(64))]
#[derive(Debug)]
struct XsaveArea {
// max size for 256-bit registers is 800 bytes:
// see https://software.intel.com/en-us/node/682996
// max size for 512-bit registers is 2560 bytes:
// FIXME: add source
data: [u8; 2560],
pub(crate) struct XsaveArea {
data: Box<[AlignedArray]>,
}
#[repr(align(64))]
#[derive(Copy, Clone, Debug)]
struct AlignedArray([u8; 64]);
impl XsaveArea {
fn new() -> XsaveArea {
XsaveArea { data: [0; 2560] }
#[target_feature(enable = "xsave")]
pub(crate) fn new() -> XsaveArea {
// `CPUID.(EAX=0DH,ECX=0):ECX` contains the size required to hold all supported xsave
// components. `EBX` contains the size required to hold all xsave components currently
// enabled in `XCR0`. We are using `ECX` to ensure enough space in all scenarios.
let CpuidResult { ecx, .. } = unsafe { __cpuid(0x0d) };
XsaveArea {
data: vec![AlignedArray([0; 64]); ecx.div_ceil(64) as usize].into_boxed_slice(),
}
}
fn ptr(&mut self) -> *mut u8 {
self.data.as_mut_ptr()
pub(crate) fn ptr(&mut self) -> *mut u8 {
self.data.as_mut_ptr().cast()
}
}
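Editor's note: a sketch of the sizing query `new` performs — CPUID leaf 0x0d, subleaf 0, where ECX covers all supported components (assumes the CPU exposes that leaf):

fn xsave_area_size_bytes() -> usize {
    // SAFETY: assumes CPUID leaf 0x0d is supported; real code should gate
    // this on the xsave feature bit.
    let r = unsafe { core::arch::x86_64::__cpuid_count(0x0d, 0) };
    r.ecx as usize
}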

View file

@ -5,8 +5,6 @@ use stdarch_test::assert_instr;
unsafe extern "unadjusted" {
#[link_name = "llvm.x86.addcarry.64"]
fn llvm_addcarry_u64(a: u8, b: u64, c: u64) -> (u8, u64);
#[link_name = "llvm.x86.addcarryx.u64"]
fn llvm_addcarryx_u64(a: u8, b: u64, c: u64, d: *mut u64) -> u8;
#[link_name = "llvm.x86.subborrow.64"]
fn llvm_subborrow_u64(a: u8, b: u64, c: u64) -> (u8, u64);
}
@ -19,8 +17,8 @@ unsafe extern "unadjusted" {
#[inline]
#[cfg_attr(test, assert_instr(adc))]
#[stable(feature = "simd_x86_adx", since = "1.33.0")]
pub unsafe fn _addcarry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
let (a, b) = llvm_addcarry_u64(c_in, a, b);
pub fn _addcarry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
let (a, b) = unsafe { llvm_addcarry_u64(c_in, a, b) };
*out = b;
a
}
@ -34,8 +32,8 @@ pub unsafe fn _addcarry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
#[target_feature(enable = "adx")]
#[cfg_attr(test, assert_instr(adc))]
#[stable(feature = "simd_x86_adx", since = "1.33.0")]
pub unsafe fn _addcarryx_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
llvm_addcarryx_u64(c_in, a, b, out as *mut _)
pub fn _addcarryx_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
_addcarry_u64(c_in, a, b, out)
}
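Editor's note: the typical use of these intrinsics is chaining the carry through a multi-limb addition; a two-limb (128-bit) sketch, assuming the safe signatures introduced by this diff:

#[cfg(target_arch = "x86_64")]
fn add_2x64(a: [u64; 2], b: [u64; 2]) -> ([u64; 2], u8) {
    use core::arch::x86_64::_addcarry_u64;
    let (mut lo, mut hi) = (0, 0);
    let c = _addcarry_u64(0, a[0], b[0], &mut lo); // low limb, carry out
    let c = _addcarry_u64(c, a[1], b[1], &mut hi); // high limb, carry in
    ([lo, hi], c)
}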
/// Adds unsigned 64-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in`.
@ -46,8 +44,8 @@ pub unsafe fn _addcarryx_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
#[inline]
#[cfg_attr(test, assert_instr(sbb))]
#[stable(feature = "simd_x86_adx", since = "1.33.0")]
pub unsafe fn _subborrow_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
let (a, b) = llvm_subborrow_u64(c_in, a, b);
pub fn _subborrow_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
let (a, b) = unsafe { llvm_subborrow_u64(c_in, a, b) };
*out = b;
a
}
@ -60,38 +58,6 @@ mod tests {
#[test]
fn test_addcarry_u64() {
unsafe {
let a = u64::MAX;
let mut out = 0;
let r = _addcarry_u64(0, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarry_u64(0, a, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, a);
let r = _addcarry_u64(1, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 1);
let r = _addcarry_u64(1, a, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarry_u64(0, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 7);
let r = _addcarry_u64(1, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 8);
}
}
#[simd_test(enable = "adx")]
unsafe fn test_addcarryx_u64() {
let a = u64::MAX;
let mut out = 0;
@ -120,35 +86,63 @@ mod tests {
assert_eq!(out, 8);
}
#[simd_test(enable = "adx")]
fn test_addcarryx_u64() {
let a = u64::MAX;
let mut out = 0;
let r = _addcarryx_u64(0, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarryx_u64(0, a, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, a);
let r = _addcarryx_u64(1, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 1);
let r = _addcarryx_u64(1, a, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarryx_u64(0, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 7);
let r = _addcarryx_u64(1, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 8);
}
#[test]
fn test_subborrow_u64() {
unsafe {
let a = u64::MAX;
let mut out = 0;
let a = u64::MAX;
let mut out = 0;
let r = _subborrow_u64(0, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u64(0, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u64(0, 0, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 0);
let r = _subborrow_u64(0, 0, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 0);
let r = _subborrow_u64(1, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a - 1);
let r = _subborrow_u64(1, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a - 1);
let r = _subborrow_u64(1, 0, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u64(1, 0, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u64(0, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 4);
let r = _subborrow_u64(0, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 4);
let r = _subborrow_u64(1, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 3);
}
let r = _subborrow_u64(1, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 3);
}
}

View file

@ -11,7 +11,7 @@ use stdarch_test::assert_instr;
#[inline]
#[cfg_attr(test, assert_instr(bswap))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _bswap64(x: i64) -> i64 {
pub fn _bswap64(x: i64) -> i64 {
x.swap_bytes()
}
@ -21,9 +21,7 @@ mod tests {
#[test]
fn test_bswap64() {
unsafe {
assert_eq!(_bswap64(0x0EADBEEFFADECA0E), 0x0ECADEFAEFBEAD0E);
assert_eq!(_bswap64(0x0000000000000000), 0x0000000000000000);
}
assert_eq!(_bswap64(0x0EADBEEFFADECA0E), 0x0ECADEFAEFBEAD0E);
assert_eq!(_bswap64(0x0000000000000000), 0x0000000000000000);
}
}

View file

@ -23,8 +23,8 @@ use stdarch_test::assert_instr;
#[target_feature(enable = "rdrand")]
#[cfg_attr(test, assert_instr(rdrand))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _rdrand64_step(val: &mut u64) -> i32 {
let (v, flag) = x86_rdrand64_step();
pub fn _rdrand64_step(val: &mut u64) -> i32 {
let (v, flag) = unsafe { x86_rdrand64_step() };
*val = v;
flag
}
@ -37,8 +37,8 @@ pub unsafe fn _rdrand64_step(val: &mut u64) -> i32 {
#[target_feature(enable = "rdseed")]
#[cfg_attr(test, assert_instr(rdseed))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _rdseed64_step(val: &mut u64) -> i32 {
let (v, flag) = x86_rdseed64_step();
pub fn _rdseed64_step(val: &mut u64) -> i32 {
let (v, flag) = unsafe { x86_rdseed64_step() };
*val = v;
flag
}
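Editor's note: rdrand/rdseed report transient failure through the flag (0 = no data available), so callers retry; a sketch, assuming the rdrand target feature is available at the call site:

fn try_rand_u64() -> Option<u64> {
    let mut v = 0u64;
    for _ in 0..10 {
        // SAFETY: the caller must ensure the CPU supports rdrand.
        if unsafe { core::arch::x86_64::_rdrand64_step(&mut v) } == 1 {
            return Some(v);
        }
    }
    None
}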

View file

@ -11,8 +11,6 @@ unsafe extern "C" {
fn cvtss2si64(a: __m128) -> i64;
#[link_name = "llvm.x86.sse.cvttss2si64"]
fn cvttss2si64(a: __m128) -> i64;
#[link_name = "llvm.x86.sse.cvtsi642ss"]
fn cvtsi642ss(a: __m128, b: i64) -> __m128;
}
/// Converts the lowest 32 bit float in the input vector to a 64 bit integer.
@ -65,7 +63,7 @@ pub fn _mm_cvttss_si64(a: __m128) -> i64 {
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi64_ss(a: __m128, b: i64) -> __m128 {
unsafe { cvtsi642ss(a, b) }
unsafe { simd_insert!(a, 0, b as f32) }
}
#[cfg(test)]

View file

@ -78,6 +78,7 @@ pub fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movnti", ",{a}"),
p = in(reg) mem_addr,
@ -200,6 +201,7 @@ mod tests {
let a: i64 = 7;
let mut mem = boxed::Box::<i64>::new(-1);
_mm_stream_si64(ptr::addr_of_mut!(*mem), a);
_mm_sfence();
assert_eq!(a, *mem);
}

View file

@ -30,7 +30,7 @@ unsafe extern "C" {
#[cfg_attr(test, assert_instr(bextr, CONTROL = 0x0404))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _bextri_u64<const CONTROL: u64>(a: u64) -> u64 {
pub fn _bextri_u64<const CONTROL: u64>(a: u64) -> u64 {
static_assert_uimm_bits!(CONTROL, 16);
unsafe { bextri_u64(a, CONTROL) }
}
@ -42,7 +42,7 @@ pub unsafe fn _bextri_u64<const CONTROL: u64>(a: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcfill))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcfill_u64(x: u64) -> u64 {
pub fn _blcfill_u64(x: u64) -> u64 {
x & x.wrapping_add(1)
}
@ -53,7 +53,7 @@ pub unsafe fn _blcfill_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blci))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blci_u64(x: u64) -> u64 {
pub fn _blci_u64(x: u64) -> u64 {
x | !x.wrapping_add(1)
}
@ -64,7 +64,7 @@ pub unsafe fn _blci_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcic))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcic_u64(x: u64) -> u64 {
pub fn _blcic_u64(x: u64) -> u64 {
!x & x.wrapping_add(1)
}
@ -76,7 +76,7 @@ pub unsafe fn _blcic_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcmsk))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcmsk_u64(x: u64) -> u64 {
pub fn _blcmsk_u64(x: u64) -> u64 {
x ^ x.wrapping_add(1)
}
@ -87,7 +87,7 @@ pub unsafe fn _blcmsk_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcs))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcs_u64(x: u64) -> u64 {
pub fn _blcs_u64(x: u64) -> u64 {
x | x.wrapping_add(1)
}
@ -98,7 +98,7 @@ pub unsafe fn _blcs_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blsfill))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blsfill_u64(x: u64) -> u64 {
pub fn _blsfill_u64(x: u64) -> u64 {
x | x.wrapping_sub(1)
}
@ -109,7 +109,7 @@ pub unsafe fn _blsfill_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blsic))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blsic_u64(x: u64) -> u64 {
pub fn _blsic_u64(x: u64) -> u64 {
!x | x.wrapping_sub(1)
}
@ -121,7 +121,7 @@ pub unsafe fn _blsic_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(t1mskc))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _t1mskc_u64(x: u64) -> u64 {
pub fn _t1mskc_u64(x: u64) -> u64 {
!x | x.wrapping_add(1)
}
@ -133,7 +133,7 @@ pub unsafe fn _t1mskc_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(tzmsk))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _tzmsk_u64(x: u64) -> u64 {
pub fn _tzmsk_u64(x: u64) -> u64 {
!x & x.wrapping_sub(1)
}

View file

@ -126,29 +126,10 @@ pub unsafe fn _xrstors64(mem_addr: *const u8, rs_mask: u64) {
#[cfg(test)]
mod tests {
use crate::core_arch::x86_64::xsave;
use std::fmt;
use crate::core_arch::x86::*;
use crate::core_arch::x86_64::*;
use stdarch_test::simd_test;
#[repr(align(64))]
#[derive(Debug)]
struct XsaveArea {
// max size for 256-bit registers is 800 bytes:
// see https://software.intel.com/en-us/node/682996
// max size for 512-bit registers is 2560 bytes:
// FIXME: add source
data: [u8; 2560],
}
impl XsaveArea {
fn new() -> XsaveArea {
XsaveArea { data: [0; 2560] }
}
fn ptr(&mut self) -> *mut u8 {
self.data.as_mut_ptr()
}
}
#[simd_test(enable = "xsave")]
#[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri
unsafe fn test_xsave64() {
@ -156,9 +137,9 @@ mod tests {
let mut a = XsaveArea::new();
let mut b = XsaveArea::new();
xsave::_xsave64(a.ptr(), m);
xsave::_xrstor64(a.ptr(), m);
xsave::_xsave64(b.ptr(), m);
_xsave64(a.ptr(), m);
_xrstor64(a.ptr(), m);
_xsave64(b.ptr(), m);
}
#[simd_test(enable = "xsave,xsaveopt")]
@ -168,9 +149,9 @@ mod tests {
let mut a = XsaveArea::new();
let mut b = XsaveArea::new();
xsave::_xsaveopt64(a.ptr(), m);
xsave::_xrstor64(a.ptr(), m);
xsave::_xsaveopt64(b.ptr(), m);
_xsaveopt64(a.ptr(), m);
_xrstor64(a.ptr(), m);
_xsaveopt64(b.ptr(), m);
}
#[simd_test(enable = "xsave,xsavec")]
@ -180,8 +161,8 @@ mod tests {
let mut a = XsaveArea::new();
let mut b = XsaveArea::new();
xsave::_xsavec64(a.ptr(), m);
xsave::_xrstor64(a.ptr(), m);
xsave::_xsavec64(b.ptr(), m);
_xsavec64(a.ptr(), m);
_xrstor64(a.ptr(), m);
_xsavec64(b.ptr(), m);
}
}

View file

@ -19,3 +19,6 @@ pretty_env_logger = "0.5.0"
rayon = "1.5.0"
diff = "0.1.12"
itertools = "0.14.0"
quick-xml = { version = "0.37.5", features = ["serialize", "overlapped-lists"] }
serde-xml-rs = "0.8.0"
regex = "1.11.1"

View file

@ -0,0 +1,904 @@
# Defined under a similar name
#__bswap_64
_bswap64
# Provides a pointer to allocated memory, which is difficult to test
_mm_malloc
# requires target feature 'waitpkg', but would be inlined into function that is compiled without support for 'waitpkg'
_tpause
_umwait
# `use of undeclared identifier` error in Clang
_bit_scan_forward
_bit_scan_reverse
_bswap
_castf32_u32
_castf64_u64
_castu32_f32
_castu64_f64
_lrotl
_lrotr
_may_i_use_cpu_feature
_may_i_use_cpu_feature_ext
_mm256_acos_pd
_mm256_acos_ph
_mm256_acos_ps
_mm256_acosh_pd
_mm256_acosh_ph
_mm256_acosh_ps
_mm256_asin_pd
_mm256_asin_ph
_mm256_asin_ps
_mm256_asinh_pd
_mm256_asinh_ph
_mm256_asinh_ps
_mm256_atan_pd
_mm256_atan_ps
_mm256_atan_ph
_mm256_atan2_pd
_mm256_atan2_ph
_mm256_atan2_ps
_mm256_atanh_pd
_mm256_atanh_ph
_mm256_atanh_ps
_mm256_cbrt_pd
_mm256_cbrt_ph
_mm256_cbrt_ps
_mm256_cdfnorm_pd
_mm256_cdfnorm_ph
_mm256_cdfnorm_ps
_mm256_cdfnorminv_pd
_mm256_cdfnorminv_ph
_mm256_cdfnorminv_ps
_mm256_cexp_ps
_mm256_cos_pd
_mm256_cos_ph
_mm256_cos_ps
_mm256_cosd_pd
_mm256_cosd_ph
_mm256_cosd_ps
_mm256_cosh_pd
_mm256_cosh_ph
_mm256_cosh_ps
_mm256_csqrt_ps
_mm256_div_epi16
_mm256_div_epi32
_mm256_div_epi64
_mm256_div_epi8
_mm256_div_epu16
_mm256_div_epu32
_mm256_div_epu64
_mm256_div_epu8
_mm256_dpbssd_epi32
_mm256_dpbssds_epi32
_mm256_dpbsud_epi32
_mm256_dpbsuds_epi32
_mm256_dpbuud_epi32
_mm256_dpbuuds_epi32
_mm256_dpwsud_epi32
_mm256_dpwsuds_epi32
_mm256_dpwusd_epi32
_mm256_dpwusds_epi32
_mm256_dpwuud_epi32
_mm256_dpwuuds_epi32
_mm256_erf_pd
_mm256_erf_ps
_mm256_erfc_pd
_mm256_erfc_ph
_mm256_erfc_ps
_mm256_erfcinv_pd
_mm256_erfcinv_ph
_mm256_erfcinv_ps
_mm256_erfinv_pd
_mm256_erfinv_ph
_mm256_erfinv_ps
_mm256_exp10_pd
_mm256_exp10_ph
_mm256_exp10_ps
_mm256_exp2_pd
_mm256_exp2_ph
_mm256_exp2_ps
_mm256_exp_pd
_mm256_exp_ph
_mm256_exp_ps
_mm256_expm1_pd
_mm256_expm1_ph
_mm256_expm1_ps
_mm256_hypot_pd
_mm256_hypot_ph
_mm256_hypot_ps
_mm256_idiv_epi32
_mm256_invcbrt_pd
_mm256_invcbrt_ph
_mm256_invcbrt_ps
_mm256_invsqrt_pd
_mm256_invsqrt_ph
_mm256_invsqrt_ps
_mm256_irem_epi32
_mm256_log10_pd
_mm256_log10_ph
_mm256_log10_ps
_mm256_log1p_pd
_mm256_log1p_ph
_mm256_log1p_ps
_mm256_log2_pd
_mm256_log2_ph
_mm256_log2_ps
_mm256_log_pd
_mm256_log_ph
_mm256_log_ps
_mm256_logb_pd
_mm256_logb_ph
_mm256_logb_ps
_mm256_clog_ps
_mm256_madd52hi_avx_epu64
_mm256_madd52lo_avx_epu64
_mm256_erf_ph
_mm256_mask_reduce_add_epi16
_mm256_mask_reduce_add_epi8
_mm256_mask_reduce_and_epi16
_mm256_mask_reduce_and_epi8
_mm256_mask_reduce_max_epi16
_mm256_mask_reduce_max_epi8
_mm256_mask_reduce_max_epu16
_mm256_mask_reduce_max_epu8
_mm256_mask_reduce_min_epi16
_mm256_mask_reduce_min_epi8
_mm256_mask_reduce_min_epu16
_mm256_mask_reduce_min_epu8
_mm256_mask_reduce_mul_epi16
_mm256_mask_reduce_mul_epi8
_mm256_mask_reduce_or_epi16
_mm256_mask_reduce_or_epi8
_mm512_cosd_ph
_mm512_cosd_ps
_mm512_cosh_pd
_mm512_cosh_ph
_mm512_cosh_ps
_mm512_div_epi16
_mm512_div_epi32
_mm512_div_epi64
_mm512_div_epi8
_mm512_div_epu16
_mm512_div_epu32
_mm512_div_epu64
_mm512_div_epu8
_mm512_erf_pd
_mm512_erf_ph
_mm512_erf_ps
_mm512_erfc_pd
_mm512_erfc_ph
_mm512_erfc_ps
_mm512_erfcinv_pd
_mm512_erfcinv_ph
_mm512_erfcinv_ps
_mm512_erfinv_pd
_mm512_erfinv_ph
_mm512_erfinv_ps
_mm512_exp10_pd
_mm512_exp10_ph
_mm512_exp10_ps
_mm512_exp2_pd
_mm512_exp2_ph
_mm512_exp2_ps
_mm512_exp_pd
_mm512_exp_ph
_mm512_exp_ps
_mm512_expm1_pd
_mm512_expm1_ph
_mm512_expm1_ps
_mm512_floor_ph
_mm512_hypot_pd
_mm512_hypot_ph
_mm512_hypot_ps
_mm512_invsqrt_pd
_mm512_invsqrt_ph
_mm512_invsqrt_ps
_mm512_log10_pd
_mm512_log10_ph
_mm512_log10_ps
_mm512_log1p_pd
_mm512_log1p_ph
_mm512_log1p_ps
_mm512_log2_pd
_mm512_log2_ph
_mm512_log2_ps
_mm512_log_pd
_mm512_log_ph
_mm512_log_ps
_mm512_logb_pd
_mm512_logb_ph
_mm512_logb_ps
_mm512_mask_acos_pd
_mm512_mask_acos_ph
_mm512_mask_acos_ps
_mm512_mask_acosh_pd
_mm512_mask_acosh_ph
_mm512_mask_acosh_ps
_mm512_mask_asin_pd
_mm512_mask_asin_ph
_mm512_mask_asin_ps
_mm512_mask_asinh_pd
_mm512_mask_asinh_ph
_mm512_mask_asinh_ps
_mm512_mask_atan2_pd
_mm512_mask_atan2_ps
_mm512_mask_atan_pd
_mm512_mask_atan_ph
_mm512_mask_atanh_pd
_mm512_mask_atanh_ph
_mm512_mask_atanh_ps
_mm512_mask_cbrt_pd
_mm512_mask_cbrt_ph
_mm512_mask_cbrt_ps
_mm512_mask_cdfnorm_pd
_mm512_mask_cdfnorm_ph
_mm512_mask_cdfnorm_ps
_mm512_mask_cdfnorminv_pd
_mm512_mask_cdfnorminv_ph
_mm512_mask_cdfnorminv_ps
_mm512_mask_ceil_ph
_mm512_mask_cos_pd
_mm512_mask_cos_ph
_mm512_mask_cos_ps
_mm512_mask_cosd_pd
_mm512_mask_cosd_ph
_mm512_mask_cosd_ps
_mm512_mask_cosh_pd
_mm512_mask_cosh_ph
_mm512_mask_cosh_ps
_mm512_mask_atan_ps
_mm512_cosd_pd
_mm512_cos_ps
_mm512_cos_ph
_mm512_cos_pd
_mm512_mask_div_epi32
_mm512_mask_div_epu32
_mm512_mask_erf_pd
_mm512_mask_erf_ph
_mm512_mask_erf_ps
_mm512_mask_erfc_pd
_mm512_mask_erfc_ph
_mm512_mask_erfc_ps
_mm512_mask_erfcinv_pd
_mm512_mask_erfcinv_ph
_mm512_mask_erfcinv_ps
_mm512_mask_erfinv_pd
_mm512_mask_erfinv_ph
_mm512_mask_erfinv_ps
_mm512_mask_exp10_pd
_mm512_mask_exp10_ph
_mm512_mask_exp10_ps
_mm512_mask_exp2_pd
_mm512_mask_exp2_ph
_mm512_mask_exp2_ps
_mm512_mask_exp_pd
_mm512_mask_exp_ph
_mm512_mask_exp_ps
_mm512_mask_expm1_pd
_mm512_mask_expm1_ph
_mm512_mask_expm1_ps
_mm512_mask_floor_ph
_mm512_mask_hypot_pd
_mm512_mask_hypot_ps
_mm512_mask_invsqrt_pd
_mm512_mask_invsqrt_ph
_mm512_mask_invsqrt_ps
_mm512_mask_log10_pd
_mm512_mask_log10_ph
_mm512_mask_log10_ps
_mm512_mask_log1p_pd
_mm512_mask_log1p_ph
_mm512_mask_log1p_ps
_mm512_mask_log2_pd
_mm512_mask_log2_ph
_mm512_mask_log2_ps
_mm512_mask_log_pd
_mm512_mask_log_ph
_mm512_mask_log_ps
_mm512_mask_logb_pd
_mm512_mask_logb_ph
_mm512_mask_logb_ps
_mm512_mask_nearbyint_pd
_mm512_mask_nearbyint_ph
_mm512_mask_nearbyint_ps
_mm512_mask_pow_pd
_mm512_mask_pow_ps
_mm512_mask_recip_pd
_mm512_mask_recip_ph
_mm512_mask_recip_ps
_mm512_mask_rem_epi32
_mm512_mask_rem_epu32
_mm512_mask_rint_pd
_mm512_mask_rint_ph
_mm512_mask_rint_ps
_mm512_mask_sin_pd
_mm512_mask_sin_ph
_mm512_mask_sin_ps
_mm512_mask_sind_pd
_mm512_mask_sind_ph
_mm512_mask_sind_ps
_mm512_mask_sinh_pd
_mm512_mask_sinh_ph
_mm512_mask_sinh_ps
_mm512_mask_svml_round_pd
_mm512_mask_svml_round_ph
_mm512_mask_tan_pd
_mm512_mask_tan_ph
_mm512_mask_tan_ps
_mm512_mask_tand_pd
_mm512_mask_tand_ph
_mm512_mask_tand_ps
_mm512_mask_tanh_pd
_mm512_mask_tanh_ph
_mm512_mask_tanh_ps
_mm512_mask_trunc_pd
_mm512_mask_trunc_ph
_mm512_mask_trunc_ps
_mm512_nearbyint_pd
_mm512_nearbyint_ph
_mm512_nearbyint_ps
_mm512_pow_pd
_mm512_pow_ph
_mm512_pow_ps
_mm512_recip_pd
_mm512_recip_ph
_mm512_recip_ps
_mm512_rem_epi16
_mm512_rem_epi32
_mm512_rem_epi64
_mm512_rem_epi8
_mm512_rem_epu16
_mm512_rem_epu32
_mm512_rem_epu64
_mm512_rem_epu8
_mm512_rint_pd
_mm512_rint_ph
_mm512_rint_ps
_mm512_sin_pd
_mm512_sin_ph
_mm512_sin_ps
_mm512_sind_pd
_mm512_sind_ph
_mm512_sind_ps
_mm512_sinh_pd
_mm512_sinh_ph
_mm512_sinh_ps
_mm512_svml_round_pd
_mm512_svml_round_ph
_mm512_tan_pd
_mm512_tan_ph
_mm512_tan_ps
_mm512_tand_pd
_mm512_tand_ph
_mm512_tand_ps
_mm512_tanh_pd
_mm512_tanh_ph
_mm512_tanh_ps
_mm512_trunc_pd
_mm512_trunc_ph
_mm512_trunc_ps
_mm_acos_pd
_mm_acos_ph
_mm_acos_ps
_mm_acosh_pd
_mm_acosh_ph
_mm_acosh_ps
_mm_asin_pd
_mm_asin_ph
_mm_asin_ps
_mm_asinh_pd
_mm_asinh_ph
_mm_asinh_ps
_mm_atan2_pd
_mm_atan2_ph
_mm_atan2_ps
_mm_atan_pd
_mm_atan_ph
_mm_atan_ps
_mm_atanh_pd
_mm_atanh_ph
_mm_atanh_ps
_mm_cbrt_pd
_mm_cbrt_ph
_mm_cbrt_ps
_mm_cdfnorm_pd
_mm_cdfnorm_ph
_mm_cdfnorm_ps
_mm_cdfnorminv_pd
_mm_cdfnorminv_ph
_mm_cdfnorminv_ps
_mm_cexp_ps
_mm_clog_ps
_mm_cos_pd
_mm_cos_ph
_mm_cos_ps
_mm_cosd_pd
_mm_cosd_ph
_mm_cosd_ps
_mm_cosh_pd
_mm_cosh_ph
_mm_cosh_ps
_mm_csqrt_ps
_mm_cvtsd_si64x
_mm_cvtsi128_si64x
_mm_cvtsi64x_sd
_mm_cvtsi64x_si128
_mm_cvttsd_si64x
_mm_div_epi16
_mm_div_epi32
_mm_div_epi64
_mm_div_epi8
_mm_div_epu16
_mm_div_epu32
_mm_div_epu64
_mm_div_epu8
_mm_dpbssd_epi32
_mm_dpbssds_epi32
_mm_dpbsud_epi32
_mm_dpbsuds_epi32
_mm_dpbuud_epi32
_mm_dpbuuds_epi32
_mm_dpwsud_epi32
_mm_dpwsuds_epi32
_mm_dpwusd_epi32
_mm_dpwusds_epi32
_mm_dpwuud_epi32
_mm_dpwuuds_epi32
_mm_erf_pd
_mm_erf_ph
_mm_erf_ps
_mm_erfc_pd
_mm_erfc_ph
_mm_erfc_ps
_mm_erfcinv_pd
_mm_erfcinv_ph
_mm_erfcinv_ps
_mm_erfinv_pd
_mm_erfinv_ph
_mm_erfinv_ps
_mm_exp10_pd
_mm_exp10_ph
_mm_exp10_ps
_mm_exp2_pd
_mm_exp2_ph
_mm_exp2_ps
_mm_exp_pd
_mm_exp_ph
_mm_exp_ps
_mm_expm1_pd
_mm_expm1_ph
_mm_expm1_ps
_mm_hypot_pd
_mm_hypot_ph
_mm_hypot_ps
_mm_idiv_epi32
_mm_invcbrt_pd
_mm_invcbrt_ph
_mm_invcbrt_ps
_mm_invsqrt_pd
_mm_invsqrt_ph
_mm_invsqrt_ps
_mm_irem_epi32
_mm_log10_pd
_mm_log10_ph
_mm_log10_ps
_mm_log1p_pd
_mm_log1p_ph
_mm_log1p_ps
_mm_log2_pd
_mm_log2_ph
_mm_log2_ps
_mm_log_pd
_mm_log_ph
_mm_log_ps
_mm_logb_pd
_mm_logb_ph
_mm_logb_ps
_mm_madd52hi_avx_epu64
_mm_madd52lo_avx_epu64
_mm_mask_reduce_add_epi16
_mm_mask_reduce_add_epi8
_mm_mask_reduce_and_epi16
_mm_mask_reduce_and_epi8
_mm_mask_reduce_max_epi16
_mm_mask_reduce_max_epi8
_mm_mask_reduce_max_epu16
_mm_mask_reduce_max_epu8
_mm_mask_reduce_min_epi16
_mm_mask_reduce_min_epi8
_mm_mask_reduce_min_epu16
_mm_mask_reduce_min_epu8
_mm_mask_reduce_mul_epi16
_mm_mask_reduce_mul_epi8
_mm_mask_reduce_or_epi16
_mm_mask_reduce_or_epi8
_mm_pow_pd
_mm_pow_ph
_mm_pow_ps
_mm_reduce_add_epi16
_mm_reduce_add_epi8
_mm_reduce_and_epi16
_mm_reduce_and_epi8
_mm_reduce_max_epi16
_mm_reduce_max_epi8
_mm_reduce_max_epu16
_mm_reduce_max_epu8
_mm_reduce_min_epi16
_mm_reduce_min_epi8
_mm_reduce_min_epu16
_mm_reduce_min_epu8
_mm_reduce_mul_epi16
_mm_reduce_mul_epi8
_mm_reduce_or_epi16
_mm_reduce_or_epi8
_mm_rem_epi16
_mm_rem_epi32
_mm_rem_epi64
_mm_rem_epi8
_mm_rem_epu16
_mm_rem_epu32
_mm_rem_epu64
_mm_rem_epu8
_mm_sin_pd
_mm_sin_ph
_mm_sin_ps
_mm_sind_pd
_mm_sind_ph
_mm_sind_ps
_mm_sinh_pd
_mm_sinh_ph
_mm_sinh_ps
_mm_sm3msg1_epi32
_mm_sm3msg2_epi32
_mm_sm3rnds2_epi32
_mm_sm4key4_epi32
_mm_sm4rnds4_epi32
_mm_svml_ceil_pd
_mm_svml_ceil_ph
_mm_svml_ceil_ps
_mm_svml_floor_pd
_mm_svml_floor_ph
_mm_svml_floor_ps
_mm_svml_round_pd
_mm_svml_round_ph
_mm_svml_round_ps
_mm_svml_sqrt_pd
_mm_svml_sqrt_ph
_mm_svml_sqrt_ps
_mm_tan_pd
_mm_tan_ph
_mm_tan_ps
_mm_tand_pd
_mm_tand_ph
_mm_tand_ps
_mm_tanh_pd
_mm_tanh_ph
_mm_tanh_ps
_mm_trunc_pd
_mm_trunc_ph
_mm_trunc_ps
_mm_udiv_epi32
_mm_urem_epi32
_popcnt32
_popcnt64
_rdpmc
_rotl
_rotl64
_rotr
_rotr64
_rotwl
_rotwr
_urdmsr
# Cannot find value in this scope (in Rust testfiles)
_mm512_set1_pch
_mm_abs_pi16
_mm_abs_pi32
_mm_abs_pi8
_mm_add_pi16
_mm_add_pi32
_mm_add_pi8
_mm_add_si64
_mm_adds_pi16
_mm_adds_pi8
_mm_adds_pu16
_mm_adds_pu8
_mm_alignr_pi8
_mm_and_si64
_mm_andnot_si64
_mm_avg_pu16
_mm_avg_pu8
_mm_cmpeq_pi16
_mm_cmpeq_pi32
_mm_cmpeq_pi8
_mm_cmpgt_pi16
_mm_cmpgt_pi32
_mm_cmpgt_pi8
_mm_cvt_pi2ps
_mm_cvt_ps2pi
_mm_cvtm64_si64
_mm_cvtpd_pi32
_mm_cvtpi16_ps
_mm_cvtpi32_pd
_mm_cvtpi32_ps
_mm_cvtpi32x2_ps
_mm_cvtpi8_ps
_mm_cvtps_pi16
_mm_cvtps_pi32
_mm_cvtps_pi8
_mm_cvtpu16_ps
_mm_cvtpu8_ps
_mm_cvtsi32_si64
_mm_cvtsi64_m64
_mm_cvtsi64_si32
_mm_cvtt_ps2pi
_mm_cvttpd_pi32
_mm512_cbrt_pd
_mm512_cbrt_ph
_mm512_cbrt_ps
_mm512_cdfnorm_pd
_mm512_cdfnorm_ph
_mm512_cdfnorm_ps
_mm512_cdfnorminv_pd
_mm512_cdfnorminv_ph
_mm512_cdfnorminv_ps
_mm512_ceil_pd
_mm512_ceil_ph
_mm512_ceil_ps
_mm512_floor_pd
_mm512_floor_ps
_mm512_mask_ceil_pd
_mm512_mask_ceil_ps
_mm_max_pi16
_mm_max_pu8
_mm_min_pi16
_mm_min_pu8
_mm_movemask_pi8
_mm_movepi64_pi64
_mm_movpi64_epi64
_mm_mul_su32
_mm_mulhi_pi16
_mm_mulhi_pu16
_mm_mulhrs_pi16
_mm_mullo_pi16
_mm_or_si64
_mm_packs_pi16
_mm_packs_pi32
_mm_packs_pu16
_mm_popcnt_u32
_mm_popcnt_u64
_mm_sad_pu8
_mm_set1_epi64
_mm_set1_pch
_mm_set1_pi16
_mm_set1_pi32
_mm_set1_pi8
_mm_set_epi64
_mm_set_pi16
_mm_set_pi32
_mm_set_pi8
_mm_setr_epi64
_mm_setr_pi16
_mm_setr_pi32
_mm_setr_pi8
_mm_shuffle_pi16
_mm_shuffle_pi8
_mm_sign_pi16
_mm_sign_pi32
_mm_sign_pi8
_mm_sll_pi16
_mm_sll_pi32
_mm_sll_si64
_mm_slli_pi16
_mm_slli_pi32
_mm_slli_si64
_mm_sra_pi16
_mm_sra_pi32
_mm_srai_pi16
_mm_srai_pi32
_mm_srl_pi16
_mm_srl_pi32
_mm_srl_si64
_mm_srli_pi16
_mm_srli_pi32
_mm_srli_si64
_mm_sub_pi16
_mm_sub_pi32
_mm_sub_pi8
_mm_sub_si64
_mm_subs_pi16
_mm_subs_pi8
_mm_subs_pu16
_mm_subs_pu8
_mm_unpackhi_pi16
_mm_unpackhi_pi32
_mm_unpackhi_pi8
_mm_unpacklo_pi16
_mm_unpacklo_pi32
_mm_unpacklo_pi8
_mm_xor_si64
_mm256_pow_pd
_mm256_pow_ph
_mm256_pow_ps
_mm256_rem_epi16
_mm256_rem_epi32
_mm256_rem_epi64
_mm256_rem_epi8
_mm256_rem_epu16
_mm256_rem_epu32
_mm256_rem_epu64
_mm256_rem_epu8
_mm256_set1_pch
_mm256_sin_pd
_mm256_sin_ph
_mm256_sin_ps
_mm256_sind_pd
_mm256_sind_ph
_mm256_sind_ps
_mm256_sinh_pd
_mm256_sinh_ph
_mm256_sinh_ps
_mm256_svml_ceil_pd
_mm256_svml_ceil_ph
_mm256_svml_ceil_ps
_mm256_svml_floor_pd
_mm256_svml_floor_ph
_mm256_svml_floor_ps
_mm256_svml_round_pd
_mm256_svml_round_ph
_mm256_svml_round_ps
_mm256_svml_sqrt_pd
_mm256_svml_sqrt_ph
_mm256_svml_sqrt_ps
_mm256_tan_pd
_mm256_tan_ph
_mm256_tan_ps
_mm256_tand_pd
_mm256_tand_ph
_mm256_tand_ps
_mm256_tanh_pd
_mm256_tanh_ph
_mm256_tanh_ps
_mm256_trunc_pd
_mm256_trunc_ph
_mm256_trunc_ps
_mm256_udiv_epi32
_mm256_urem_epi32
_mm512_acos_pd
_mm512_acos_ph
_mm512_acos_ps
_mm512_acosh_pd
_mm512_acosh_ph
_mm512_acosh_ps
_mm_cvttps_pi32
_mm_extract_pi16
_mm_hadd_pi16
_mm_hadd_pi32
_mm_hadds_pi16
_mm_hsub_pi16
_mm_hsub_pi32
_mm_hsubs_pi16
_mm_insert_pi16
_mm_madd_pi16
_mm_maddubs_pi16
_mm512_asin_pd
_mm512_asin_ph
_mm512_asin_ps
_mm512_asinh_pd
_mm512_asinh_ph
_mm512_asinh_ps
_mm512_atan2_pd
_mm512_atan2_ph
_mm512_atan2_ps
_mm512_atan_pd
_mm512_atan_ph
_mm512_atan_ps
_mm512_atanh_pd
_mm512_atanh_ph
_mm512_atanh_ps
_cvtsh_ss
_cvtss_sh
_m_from_int
_m_from_int64
_m_packssdw
_m_packsswb
_m_packuswb
_m_paddb
_m_paddd
_m_paddsb
_m_paddsw
_m_paddusb
_m_paddusw
_m_paddw
_m_pand
_m_pandn
_m_pavgb
_m_pavgw
_m_pcmpeqb
_m_pcmpeqd
_m_pcmpeqw
_m_pcmpgtb
_m_pcmpgtd
_m_pcmpgtw
_m_pextrw
_m_pinsrw
_m_pmaddwd
_m_pmaxsw
_m_pmaxub
_m_pminsw
_m_pminub
_m_pmovmskb
_m_pmulhuw
_m_pmulhw
_m_pmullw
_m_por
_m_psadbw
_m_pshufw
_m_pslld
_m_pslldi
_m_psllq
_m_psllqi
_m_psllw
_m_psllwi
_m_psrad
_m_psradi
_m_psraw
_m_psrawi
_m_psrld
_m_psrldi
_m_psrlq
_m_psrlqi
_m_psrlw
_m_psrlwi
_m_psubb
_m_psubd
_m_psubsb
_m_psubsw
_m_psubusb
_m_psubusw
_m_psubw
_m_punpckhbw
_m_punpckhdq
_m_punpckhwd
_m_punpcklbw
_m_punpckldq
_m_punpcklwd
_m_pxor
_m_to_int
_m_to_int64
_mm512_mask_floor_pd
_mm512_mask_floor_ps
# SDE ERROR: Cannot execute XGETBV with ECX != 0
_xgetbv
# Miscellaneous issues that can be fixed first
_kshiftli_mask16
_kshiftli_mask32
_kshiftli_mask64
_kshiftli_mask8
_kshiftri_mask16
_kshiftri_mask32
_kshiftri_mask64
_kshiftri_mask8
_mm256_castsi128_si256
_mm256_extract_epi16
_mm256_extract_epi8
_mm512_castsi128_si512
_mm512_castsi256_si512
# _mm512_conj_pch
_mm512_mask_reduce_max_pd
_mm512_mask_reduce_max_ps
_mm512_mask_reduce_min_pd
_mm512_mask_reduce_min_ps
_mm_comineq_sh
_mm_extract_epi16
_mm_extract_epi8
_mm_mask_cvtepi16_epi8
_mm_mask_cvtpd_epi32
_mm_mask_cvtpd_ps
_mm_ucomineq_sh

View file

@ -3,13 +3,24 @@ pub const NOTICE: &str = "\
// test are derived from a JSON specification, published under the same license as the
// `intrinsic-test` crate.\n";
pub const POLY128_OSTREAM_DECL: &str = r#"
pub const PLATFORM_C_FORWARD_DECLARATIONS: &str = r#"
#ifdef __aarch64__
std::ostream& operator<<(std::ostream& os, poly128_t value);
#endif
std::ostream& operator<<(std::ostream& os, float16_t value);
std::ostream& operator<<(std::ostream& os, uint8_t value);
// T1 is the `To` type, T2 is the `From` type
template<typename T1, typename T2> T1 cast(T2 x) {
static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same");
T1 ret{};
memcpy(&ret, &x, sizeof(T1));
return ret;
}
"#;
pub const POLY128_OSTREAM_DEF: &str = r#"
pub const PLATFORM_C_DEFINITIONS: &str = r#"
#ifdef __aarch64__
std::ostream& operator<<(std::ostream& os, poly128_t value) {
std::stringstream temp;
@ -23,11 +34,26 @@ std::ostream& operator<<(std::ostream& os, poly128_t value) {
os << res;
return os;
}
#endif
std::ostream& operator<<(std::ostream& os, float16_t value) {
uint16_t temp = 0;
memcpy(&temp, &value, sizeof(float16_t));
std::stringstream ss;
ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << temp;
os << ss.str();
return os;
}
std::ostream& operator<<(std::ostream& os, uint8_t value) {
os << (unsigned int) value;
return os;
}
"#;
// Format f16 values (and vectors containing them) in a way that is consistent with C.
pub const F16_FORMATTING_DEF: &str = r#"
pub const PLATFORM_RUST_DEFINITIONS: &str = r#"
/// Used to continue `Debug`ging SIMD types as `MySimd(1, 2, 3, 4)`, as they
/// were before moving to array-based simd.
#[inline]
@ -113,7 +139,7 @@ impl DebugHexF16 for float16x8x4_t {
}
"#;
pub const AARCH_CONFIGURATIONS: &str = r#"
pub const PLATFORM_RUST_CFGS: &str = r#"
#![cfg_attr(target_arch = "arm", feature(stdarch_arm_neon_intrinsics))]
#![cfg_attr(target_arch = "arm", feature(stdarch_aarch32_crc32))]
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_fcma))]
@ -121,12 +147,13 @@ pub const AARCH_CONFIGURATIONS: &str = r#"
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_i8mm))]
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_sm4))]
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_ftts))]
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_aarch64_jscvt))]
#![feature(fmt_helpers_for_derive)]
#![feature(stdarch_neon_f16)]
#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
use core::arch::aarch64::*;
use core_arch::arch::aarch64::*;
#[cfg(target_arch = "arm")]
use core::arch::arm::*;
use core_arch::arch::arm::*;
"#;

View file

@ -32,11 +32,11 @@ impl SupportedArchitectureTest for ArmArchitectureTest {
const NOTICE: &str = config::NOTICE;
const PLATFORM_C_HEADERS: &[&str] = &["arm_neon.h", "arm_acle.h", "arm_fp16.h"];
const PLATFORM_C_DEFINITIONS: &str = config::POLY128_OSTREAM_DEF;
const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::POLY128_OSTREAM_DECL;
const PLATFORM_C_DEFINITIONS: &str = config::PLATFORM_C_DEFINITIONS;
const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::PLATFORM_C_FORWARD_DECLARATIONS;
const PLATFORM_RUST_DEFINITIONS: &str = config::F16_FORMATTING_DEF;
const PLATFORM_RUST_CFGS: &str = config::AARCH_CONFIGURATIONS;
const PLATFORM_RUST_DEFINITIONS: &str = config::PLATFORM_RUST_DEFINITIONS;
const PLATFORM_RUST_CFGS: &str = config::PLATFORM_RUST_CFGS;
fn cpp_compilation(&self) -> Option<CppCompilation> {
compile::build_cpp_compilation(&self.cli_options)

View file

@ -14,10 +14,10 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType {
(None, None) => format!("{const_prefix}{prefix}{bit_len}_t"),
(Some(simd), None) => format!("{prefix}{bit_len}x{simd}_t"),
(Some(simd), Some(vec)) => format!("{prefix}{bit_len}x{simd}x{vec}_t"),
(None, Some(_)) => todo!("{:#?}", self), // Likely an invalid case
(None, Some(_)) => todo!("{self:#?}"), // Likely an invalid case
}
} else {
todo!("{:#?}", self)
todo!("{self:#?}")
}
}
@ -58,14 +58,14 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType {
// The ACLE doesn't support 64-bit polynomial loads on Armv7
// if armv7 and bl == 64, use "s", else "p"
TypeKind::Poly => if choose_workaround && *bl == 64 {"s"} else {"p"},
x => todo!("get_load_function TypeKind: {:#?}", x),
x => todo!("get_load_function TypeKind: {x:#?}"),
},
size = bl,
quad = quad,
len = vec_len.unwrap_or(1),
)
} else {
todo!("get_load_function IntrinsicType: {:#?}", self)
todo!("get_load_function IntrinsicType: {self:#?}")
}
}
@ -90,13 +90,13 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType {
TypeKind::Int(Sign::Signed) => "s",
TypeKind::Float => "f",
TypeKind::Poly => "p",
x => todo!("get_load_function TypeKind: {:#?}", x),
x => todo!("get_load_function TypeKind: {x:#?}"),
},
size = bl,
quad = quad,
)
} else {
todo!("get_lane_function IntrinsicType: {:#?}", self)
todo!("get_lane_function IntrinsicType: {self:#?}")
}
}
@ -112,12 +112,10 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType {
ty = self.c_single_vector_type(),
lanes = (0..self.num_lanes())
.map(move |idx| -> std::string::String {
let lane_fn = self.get_lane_function();
let final_cast = self.generate_final_type_cast();
format!(
"{cast}{lane_fn}(__return_value.val[{vector}], {lane})",
cast = self.c_promotion(),
lane_fn = self.get_lane_function(),
lane = idx,
vector = vector,
"{final_cast}{lane_fn}(__return_value.val[{vector}], {idx})"
)
})
.collect::<Vec<_>>()
@ -129,12 +127,9 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType {
} else if self.num_lanes() > 1 {
(0..self.num_lanes())
.map(|idx| -> std::string::String {
format!(
"{cast}{lane_fn}(__return_value, {lane})",
cast = self.c_promotion(),
lane_fn = self.get_lane_function(),
lane = idx
)
let lane_fn = self.get_lane_function();
let final_cast = self.generate_final_type_cast();
format!("{final_cast}{lane_fn}(__return_value, {idx})")
})
.collect::<Vec<_>>()
.join(r#" << ", " << "#)
@ -148,9 +143,9 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType {
TypeKind::Int(Sign::Signed) => format!("int{}_t", self.inner_size()),
TypeKind::Int(Sign::Unsigned) => format!("uint{}_t", self.inner_size()),
TypeKind::Poly => format!("poly{}_t", self.inner_size()),
ty => todo!("print_result_c - Unknown type: {:#?}", ty),
ty => todo!("print_result_c - Unknown type: {ty:#?}"),
},
promote = self.c_promotion(),
promote = self.generate_final_type_cast(),
)
};

View file

@ -30,7 +30,12 @@ where
}
pub fn to_c_type(&self) -> String {
self.ty.c_type()
let prefix = if self.ty.constant { "const " } else { "" };
format!("{prefix}{}", self.ty.c_type())
}
pub fn generate_name(&self) -> String {
format!("{}_val", self.name)
}
pub fn is_simd(&self) -> bool {
@ -55,16 +60,22 @@ where
}
/// The name (e.g. "A_VALS" or "a_vals") for the array of possible test inputs.
fn rust_vals_array_name(&self) -> impl std::fmt::Display {
pub(crate) fn rust_vals_array_name(&self) -> impl std::fmt::Display {
if self.ty.is_rust_vals_array_const() {
format!("{}_VALS", self.name.to_uppercase())
let loads = crate::common::gen_rust::PASSES;
format!(
"{}_{ty}_{load_size}",
self.name.to_uppercase(),
ty = self.ty.rust_scalar_type(),
load_size = self.ty.num_lanes() * self.ty.num_vectors() + loads - 1,
)
} else {
format!("{}_vals", self.name.to_lowercase())
}
}
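// As an illustration of the new naming scheme (assumed values: argument `a`,
// Rust scalar type `u8`, 8 lanes, 1 vector, PASSES == 20):
//
//     "A" + "_" + "u8" + "_" + (8 * 1 + 20 - 1)  =>  "A_u8_27"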
fn as_call_param_c(&self) -> String {
self.ty.as_call_param_c(&self.name)
self.ty.as_call_param_c(&self.generate_name())
}
}
@ -91,7 +102,7 @@ where
pub fn as_call_param_rust(&self) -> String {
self.iter()
.filter(|a| !a.has_constraint())
.map(|arg| arg.name.clone())
.map(|arg| arg.generate_name() + " as _")
.collect::<Vec<String>>()
.join(", ")
}
@ -106,11 +117,13 @@ where
loads: u32,
) -> std::io::Result<()> {
for arg in self.iter().filter(|&arg| !arg.has_constraint()) {
// Place the variables on an aligned boundary so that aligned,
// architecture-specific load functions can be used to read the values.
writeln!(
w,
"{indentation}const {ty} {name}_vals[] = {values};",
"{indentation}alignas(64) const {ty} {name}_vals[] = {values};",
ty = arg.ty.c_scalar_type(),
name = arg.name,
name = arg.generate_name(),
values = arg.ty.populate_random(indentation, loads, &Language::C)
)?
}
@ -127,20 +140,34 @@ where
loads: u32,
) -> std::io::Result<()> {
for arg in self.iter().filter(|&arg| !arg.has_constraint()) {
writeln!(
w,
"{indentation}{bind} {name}: [{ty}; {load_size}] = {values};",
bind = arg.rust_vals_array_binding(),
name = arg.rust_vals_array_name(),
ty = arg.ty.rust_scalar_type(),
load_size = arg.ty.num_lanes() * arg.ty.num_vectors() + loads - 1,
values = arg.ty.populate_random(indentation, loads, &Language::Rust)
)?
// Constants are defined globally.
if arg.ty.is_rust_vals_array_const() {
continue;
}
Self::gen_arg_rust(arg, w, indentation, loads)?;
}
Ok(())
}
pub fn gen_arg_rust(
arg: &Argument<T>,
w: &mut impl std::io::Write,
indentation: Indentation,
loads: u32,
) -> std::io::Result<()> {
writeln!(
w,
"{indentation}{bind} {name}: [{ty}; {load_size}] = {values};\n",
bind = arg.rust_vals_array_binding(),
name = arg.rust_vals_array_name(),
ty = arg.ty.rust_scalar_type(),
load_size = arg.ty.num_lanes() * arg.ty.num_vectors() + loads - 1,
values = arg.ty.populate_random(indentation, loads, &Language::Rust)
)
}
/// Creates a line for each argument that initializes the argument from an array `[arg]_vals` at
/// an offset `i` using a load intrinsic, in C.
/// e.g `uint8x8_t a = vld1_u8(&a_vals[i]);`
@ -153,7 +180,7 @@ where
format!(
"{indentation}{ty} {name} = cast<{ty}>({load}(&{name}_vals[i]));\n",
ty = arg.to_c_type(),
name = arg.name,
name = arg.generate_name(),
load = if arg.is_simd() {
arg.ty.get_load_function(Language::C)
} else {
@ -171,15 +198,16 @@ where
self.iter()
.filter(|&arg| !arg.has_constraint())
.map(|arg| {
let load = if arg.is_simd() {
arg.ty.get_load_function(Language::Rust)
} else {
"*".to_string()
};
let typecast = if load.len() > 2 { "as _" } else { "" };
format!(
"{indentation}let {name} = {load}({vals_name}.as_ptr().offset(i));\n",
name = arg.name,
"{indentation}let {name} = {load}({vals_name}.as_ptr().offset(i){typecast});\n",
name = arg.generate_name(),
vals_name = arg.rust_vals_array_name(),
load = if arg.is_simd() {
arg.ty.get_load_function(Language::Rust)
} else {
"*".to_string()
},
)
})
.collect()

View file

@ -54,6 +54,9 @@ pub struct Cli {
/// Set the sysroot for the C++ compiler
#[arg(long)]
pub cxx_toolchain_dir: Option<String>,
#[arg(long, default_value_t = 100u8)]
pub sample_percentage: u8,
}
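// With clap's derive naming, this should surface as `--sample-percentage`;
// a hedged invocation sketch (binary name and spec path are placeholders):
//
//     intrinsic-test intrinsics.json --sample-percentage 25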
pub struct ProcessedCli {
@ -65,6 +68,7 @@ pub struct ProcessedCli {
pub linker: Option<String>,
pub cxx_toolchain_dir: Option<String>,
pub skip: Vec<String>,
pub sample_percentage: u8,
}
impl ProcessedCli {
@ -74,6 +78,7 @@ impl ProcessedCli {
let target = cli_options.target;
let linker = cli_options.linker;
let cxx_toolchain_dir = cli_options.cxx_toolchain_dir;
let sample_percentage = cli_options.sample_percentage;
let skip = if let Some(filename) = cli_options.skip {
let data = std::fs::read_to_string(&filename).expect("Failed to open file");
@ -108,6 +113,7 @@ impl ProcessedCli {
cxx_toolchain_dir,
skip,
filename,
sample_percentage,
}
}
}

View file

@ -14,15 +14,14 @@ pub fn compare_outputs(intrinsic_name_list: &Vec<String>, runner: &str, target:
let intrinsics = intrinsic_name_list
.par_iter()
.filter_map(|intrinsic_name| {
let c = runner_command(runner)
.arg("intrinsic-test-programs")
.arg("./intrinsic-test-programs")
.arg(intrinsic_name)
.current_dir("c_programs")
.output();
let rust = runner_command(runner)
.arg(format!("target/{target}/release/intrinsic-test-programs"))
.arg(format!("./target/{target}/release/intrinsic-test-programs"))
.arg(intrinsic_name)
.current_dir("rust_programs")
.output();

View file

@ -119,7 +119,7 @@ impl CppCompilation {
output: &str,
) -> std::io::Result<std::process::Output> {
let mut cmd = clone_command(&self.0);
cmd.args([input, "-c", "-o", output]);
cmd.args([input, "-v", "-c", "-o", output]);
cmd.output()
}

View file

@ -6,6 +6,15 @@ use super::intrinsic_helpers::IntrinsicTypeDefinition;
// The number of times each intrinsic will be called.
const PASSES: u32 = 20;
const COMMON_HEADERS: [&str; 7] = [
"iostream",
"string",
"cstring",
"iomanip",
"sstream",
"type_traits",
"cassert",
];
pub fn generate_c_test_loop<T: IntrinsicTypeDefinition + Sized>(
w: &mut impl std::io::Write,
@ -47,7 +56,15 @@ pub fn generate_c_constraint_blocks<'a, T: IntrinsicTypeDefinition + 'a>(
let ty = current.ty.c_type();
writeln!(w, "{indentation}{{")?;
writeln!(w, "{body_indentation}{ty} {} = {i};", current.name)?;
// TODO: Move to actually specifying the enum value
// instead of typecasting integers, for better clarity
// of generated code.
writeln!(
w,
"{body_indentation}const {ty} {} = ({ty}){i};",
current.generate_name()
)?;
generate_c_constraint_blocks(
w,
@ -99,32 +116,10 @@ pub fn write_mod_cpp<T: IntrinsicTypeDefinition>(
) -> std::io::Result<()> {
write!(w, "{notice}")?;
for header in platform_headers {
for header in COMMON_HEADERS.iter().chain(platform_headers.iter()) {
writeln!(w, "#include <{header}>")?;
}
writeln!(
w,
r#"
#include <iostream>
#include <cstring>
#include <iomanip>
#include <sstream>
template<typename T1, typename T2> T1 cast(T2 x) {{
static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same");
T1 ret{{}};
memcpy(&ret, &x, sizeof(T1));
return ret;
}}
std::ostream& operator<<(std::ostream& os, float16_t value);
"#
)?;
writeln!(w, "{}", forward_declarations)?;
for intrinsic in intrinsics {
@ -137,33 +132,13 @@ std::ostream& operator<<(std::ostream& os, float16_t value);
pub fn write_main_cpp<'a>(
w: &mut impl std::io::Write,
arch_specific_definitions: &str,
arch_specific_headers: &[&str],
intrinsics: impl Iterator<Item = &'a str> + Clone,
) -> std::io::Result<()> {
writeln!(w, "#include <iostream>")?;
writeln!(w, "#include <string>")?;
for header in ["arm_neon.h", "arm_acle.h", "arm_fp16.h"] {
for header in COMMON_HEADERS.iter().chain(arch_specific_headers.iter()) {
writeln!(w, "#include <{header}>")?;
}
writeln!(
w,
r#"
#include <cstring>
#include <iomanip>
#include <sstream>
std::ostream& operator<<(std::ostream& os, float16_t value) {{
uint16_t temp = 0;
memcpy(&temp, &value, sizeof(float16_t));
std::stringstream ss;
ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << temp;
os << ss.str();
return os;
}}
"#
)?;
// NOTE: It's assumed that this value contains the required `ifdef`s.
writeln!(w, "{arch_specific_definitions }")?;

View file

@ -1,25 +1,31 @@
use itertools::Itertools;
use std::process::Command;
use crate::common::argument::ArgumentList;
use crate::common::intrinsic::Intrinsic;
use super::indentation::Indentation;
use super::intrinsic::format_f16_return_value;
use super::intrinsic_helpers::IntrinsicTypeDefinition;
// The number of times each intrinsic will be called.
const PASSES: u32 = 20;
pub(crate) const PASSES: u32 = 20;
macro_rules! concatln {
($($lines:expr),* $(,)?) => {
concat!($( $lines, "\n" ),*)
};
}
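// The macro just appends a newline to every argument, e.g.:
//
//     concatln!("[package]", "name = \"x\"")
//     // expands to concat!("[package]", "\n", "name = \"x\"", "\n")
//     // == "[package]\nname = \"x\"\n"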
fn write_cargo_toml_header(w: &mut impl std::io::Write, name: &str) -> std::io::Result<()> {
writeln!(
w,
concat!(
"[package]\n",
"name = \"{name}\"\n",
"version = \"{version}\"\n",
"authors = [{authors}]\n",
"license = \"{license}\"\n",
"edition = \"2018\"\n",
concatln!(
"[package]",
"name = \"{name}\"",
"version = \"{version}\"",
"authors = [{authors}]",
"license = \"{license}\"",
"edition = \"2018\"",
),
name = name,
version = env!("CARGO_PKG_VERSION"),
@ -37,6 +43,7 @@ pub fn write_bin_cargo_toml(
write_cargo_toml_header(w, "intrinsic-test-programs")?;
writeln!(w, "[dependencies]")?;
writeln!(w, "core_arch = {{ path = \"../crates/core_arch\" }}")?;
for i in 0..module_count {
writeln!(w, "mod_{i} = {{ path = \"mod_{i}/\" }}")?;
@ -118,6 +125,20 @@ pub fn write_lib_rs<T: IntrinsicTypeDefinition>(
writeln!(w, "{definitions}")?;
let mut seen = std::collections::HashSet::new();
for intrinsic in intrinsics {
for arg in &intrinsic.arguments.args {
if !arg.has_constraint() && arg.ty.is_rust_vals_array_const() {
let name = arg.rust_vals_array_name().to_string();
if seen.insert(name) {
ArgumentList::gen_arg_rust(arg, w, Indentation::default(), PASSES)?;
}
}
}
}
for intrinsic in intrinsics {
crate::common::gen_rust::create_rust_test_module(w, intrinsic)?;
}
@ -190,7 +211,7 @@ pub fn generate_rust_test_loop<T: IntrinsicTypeDefinition>(
w: &mut impl std::io::Write,
intrinsic: &Intrinsic<T>,
indentation: Indentation,
specializations: &[Vec<u8>],
specializations: &[Vec<i32>],
passes: u32,
) -> std::io::Result<()> {
let intrinsic_name = &intrinsic.name;
@ -232,30 +253,30 @@ pub fn generate_rust_test_loop<T: IntrinsicTypeDefinition>(
}
}
let return_value = format_f16_return_value(intrinsic);
let indentation2 = indentation.nested();
let indentation3 = indentation2.nested();
writeln!(
write!(
w,
"\
for (id, f) in specializations {{\n\
for i in 0..{passes} {{\n\
unsafe {{\n\
{loaded_args}\
let __return_value = f({args});\n\
println!(\"Result {{id}}-{{}}: {{:?}}\", i + 1, {return_value});\n\
}}\n\
}}\n\
}}",
loaded_args = intrinsic.arguments.load_values_rust(indentation3),
concatln!(
" for (id, f) in specializations {{",
" for i in 0..{passes} {{",
" unsafe {{",
"{loaded_args}",
" let __return_value = f({args});",
" println!(\"Result {{id}}-{{}}: {{:?}}\", i + 1, {return_value});",
" }}",
" }}",
" }}",
),
loaded_args = intrinsic.arguments.load_values_rust(indentation.nest_by(4)),
args = intrinsic.arguments.as_call_param_rust(),
return_value = intrinsic.results.print_result_rust(),
passes = passes,
)
}
/// Generate the specializations (unique sequences of const-generic arguments) for this intrinsic.
fn generate_rust_specializations(
constraints: &mut impl Iterator<Item = impl Iterator<Item = i64>>,
) -> Vec<Vec<u8>> {
) -> Vec<Vec<i32>> {
let mut specializations = vec![vec![]];
for constraint in constraints {
@ -263,7 +284,7 @@ fn generate_rust_specializations(
.flat_map(|right| {
specializations.iter().map(move |left| {
let mut left = left.clone();
left.push(u8::try_from(right).unwrap());
left.push(i32::try_from(right).unwrap());
left
})
})
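The fold builds the cartesian product of the constraint ranges; a small trace with hypothetical constraints [0, 1] and [2, 3]:

// start:        [[]]
// after [0, 1]: [[0], [1]]
// after [2, 3]: [[0, 2], [1, 2], [0, 3], [1, 3]]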

View file

@ -10,6 +10,10 @@ impl Indentation {
pub fn nested(self) -> Self {
Self(self.0 + 1)
}
pub fn nest_by(&self, additional_levels: u32) -> Self {
Self(self.0 + additional_levels)
}
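// `nest_by` is just repeated `nested`, e.g.:
//
//     Indentation(1).nest_by(3) == Indentation(4) // same as .nested().nested().nested()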
}
impl std::fmt::Display for Indentation {

View file

@ -1,5 +1,5 @@
use super::argument::ArgumentList;
use super::intrinsic_helpers::{IntrinsicTypeDefinition, TypeKind};
use super::intrinsic_helpers::IntrinsicTypeDefinition;
/// An intrinsic
#[derive(Debug, PartialEq, Clone)]
@ -16,17 +16,3 @@ pub struct Intrinsic<T: IntrinsicTypeDefinition> {
/// Any architecture-specific tags.
pub arch_tags: Vec<String>,
}
pub fn format_f16_return_value<T: IntrinsicTypeDefinition>(intrinsic: &Intrinsic<T>) -> String {
// the `intrinsic-test` crate compares the output of C and Rust intrinsics. Currently, It uses
// a string representation of the output value to compare. In C, f16 values are currently printed
// as hexadecimal integers. Since https://github.com/rust-lang/rust/pull/127013, rust does print
// them as decimal floating point values. To keep the intrinsics tests working, for now, format
// vectors containing f16 values like C prints them.
let return_value = match intrinsic.results.kind() {
TypeKind::Float if intrinsic.results.inner_size() == 16 => "debug_f16(__return_value)",
_ => "format_args!(\"{__return_value:.150?}\")",
};
String::from(return_value)
}

View file

@ -1,3 +1,4 @@
use std::cmp;
use std::fmt;
use std::ops::Deref;
use std::str::FromStr;
@ -75,9 +76,11 @@ impl TypeKind {
Self::Float => "float",
Self::Int(Sign::Signed) => "int",
Self::Int(Sign::Unsigned) => "uint",
Self::Mask => "uint",
Self::Poly => "poly",
Self::Char(Sign::Signed) => "char",
_ => unreachable!("Not used: {:#?}", self),
Self::Vector => "int",
_ => unreachable!("Not used: {self:#?}"),
}
}
@ -91,7 +94,7 @@ impl TypeKind {
Self::Poly => "u",
Self::Char(Sign::Unsigned) => "u",
Self::Char(Sign::Signed) => "i",
_ => unreachable!("Unused type kind: {:#?}", self),
_ => unreachable!("Unused type kind: {self:#?}"),
}
}
}
@ -129,9 +132,9 @@ impl IntrinsicType {
pub fn inner_size(&self) -> u32 {
if let Some(bl) = self.bit_len {
bl
cmp::max(bl, 8)
} else {
unreachable!("")
unreachable!("{self:#?}")
}
}
@ -154,6 +157,7 @@ impl IntrinsicType {
pub fn c_scalar_type(&self) -> String {
match self.kind() {
TypeKind::Char(_) => String::from("char"),
TypeKind::Vector => String::from("int32_t"),
_ => format!(
"{prefix}{bits}_t",
prefix = self.kind().c_prefix(),
@ -162,14 +166,6 @@ impl IntrinsicType {
}
}
pub fn rust_scalar_type(&self) -> String {
format!(
"{prefix}{bits}",
prefix = self.kind().rust_prefix(),
bits = self.inner_size()
)
}
pub fn c_promotion(&self) -> &str {
match *self {
IntrinsicType {
@ -177,9 +173,9 @@ impl IntrinsicType {
bit_len: Some(8),
..
} => match kind {
TypeKind::Int(Sign::Signed) => "(int)",
TypeKind::Int(Sign::Unsigned) => "(unsigned int)",
TypeKind::Poly => "(unsigned int)(uint8_t)",
TypeKind::Int(Sign::Signed) => "int",
TypeKind::Int(Sign::Unsigned) => "unsigned int",
TypeKind::Poly => "uint8_t",
_ => "",
},
IntrinsicType {
@ -188,9 +184,9 @@ impl IntrinsicType {
..
} => match bit_len {
8 => unreachable!("handled above"),
16 => "(uint16_t)",
32 => "(uint32_t)",
64 => "(uint64_t)",
16 => "uint16_t",
32 => "uint32_t",
64 => "uint64_t",
128 => "",
_ => panic!("invalid bit_len"),
},
@ -199,16 +195,16 @@ impl IntrinsicType {
bit_len: Some(bit_len),
..
} => match bit_len {
16 => "(float16_t)",
32 => "(float)",
64 => "(double)",
16 => "float16_t",
32 => "float",
64 => "double",
128 => "",
_ => panic!("invalid bit_len"),
},
IntrinsicType {
kind: TypeKind::Char(_),
..
} => "(char)",
} => "char",
_ => "",
}
}
@ -221,15 +217,16 @@ impl IntrinsicType {
) -> String {
match self {
IntrinsicType {
bit_len: Some(bit_len @ (8 | 16 | 32 | 64)),
kind: kind @ (TypeKind::Int(_) | TypeKind::Poly | TypeKind::Char(_)),
bit_len: Some(bit_len @ (1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 16 | 32 | 64)),
kind:
kind @ (TypeKind::Int(_) | TypeKind::Poly | TypeKind::Char(_) | TypeKind::Mask),
simd_len,
vec_len,
..
} => {
let (prefix, suffix) = match language {
Language::Rust => ("[", "]"),
Language::C => ("{", "}"),
Language::Rust => ('[', ']'),
Language::C => ('{', '}'),
};
let body_indentation = indentation.nested();
format!(
@ -265,12 +262,12 @@ impl IntrinsicType {
..
} => {
let (prefix, cast_prefix, cast_suffix, suffix) = match (language, bit_len) {
(&Language::Rust, 16) => ("[", "f16::from_bits(", ")", "]"),
(&Language::Rust, 32) => ("[", "f32::from_bits(", ")", "]"),
(&Language::Rust, 64) => ("[", "f64::from_bits(", ")", "]"),
(&Language::C, 16) => ("{", "cast<float16_t, uint16_t>(", ")", "}"),
(&Language::C, 32) => ("{", "cast<float, uint32_t>(", ")", "}"),
(&Language::C, 64) => ("{", "cast<double, uint64_t>(", ")", "}"),
(&Language::Rust, 16) => ('[', "f16::from_bits(", ")", ']'),
(&Language::Rust, 32) => ('[', "f32::from_bits(", ")", ']'),
(&Language::Rust, 64) => ('[', "f64::from_bits(", ")", ']'),
(&Language::C, 16) => ('{', "cast<float16_t, uint16_t>(", ")", '}'),
(&Language::C, 32) => ('{', "cast<float, uint32_t>(", ")", '}'),
(&Language::C, 64) => ('{', "cast<double, uint64_t>(", ")", '}'),
_ => unreachable!(),
};
format!(
@ -283,7 +280,44 @@ impl IntrinsicType {
)))
)
}
_ => unimplemented!("populate random: {:#?}", self),
IntrinsicType {
kind: TypeKind::Vector,
bit_len: Some(128 | 256 | 512),
simd_len,
vec_len,
..
} => {
let (prefix, suffix) = match language {
Language::Rust => ('[', ']'),
Language::C => ('{', '}'),
};
let body_indentation = indentation.nested();
let effective_bit_len = 32;
format!(
"{prefix}\n{body}\n{indentation}{suffix}",
body = (0..(vec_len.unwrap_or(1) * simd_len.unwrap_or(1) + loads - 1))
.format_with(",\n", |i, fmt| {
let src = value_for_array(effective_bit_len, i);
assert!(src == 0 || src.ilog2() < effective_bit_len);
if (src >> (effective_bit_len - 1)) != 0 {
// `src` is a two's complement representation of a negative value.
let mask = !0u64 >> (64 - effective_bit_len);
let ones_compl = src ^ mask;
let twos_compl = ones_compl + 1;
if (twos_compl == src) && (language == &Language::C) {
// `src` is INT*_MIN. C requires `-0x7fffffff - 1` to avoid
// undefined literal overflow behaviour.
fmt(&format_args!("{body_indentation}-{ones_compl:#x} - 1"))
} else {
fmt(&format_args!("{body_indentation}-{twos_compl:#x}"))
}
} else {
fmt(&format_args!("{body_indentation}{src:#x}"))
}
})
)
}
_ => unimplemented!("populate random: {self:#?}"),
}
}
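// Worked examples for the negative branch above, with effective_bit_len = 32:
//
//     src = 0xffff_fff0 (-16):      ones_compl = 0xf, twos_compl = 0x10 -> emits "-0x10"
//     src = 0x8000_0000 (i32::MIN): twos_compl == src                   -> emits "-0x7fffffff - 1"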
@ -298,7 +332,7 @@ impl IntrinsicType {
kind: TypeKind::Int(_) | TypeKind::Poly,
..
} => true,
_ => unimplemented!(),
_ => true,
}
}
@ -330,4 +364,40 @@ pub trait IntrinsicTypeDefinition: Deref<Target = IntrinsicType> {
/// rust debug output format for the return type. The generated line assumes
/// there is an int i in scope which is the current pass number.
fn print_result_c(&self, indentation: Indentation, additional: &str) -> String;
/// Formats the intrinsic's return value so that the Rust output matches the C
/// output for the return type.
///
/// The `intrinsic-test` crate compares the output of C and Rust intrinsics. Currently, it uses
/// a string representation of the output value for the comparison. In C, f16 values are currently
/// printed as hexadecimal integers. Since https://github.com/rust-lang/rust/pull/127013, Rust
/// prints them as decimal floating-point values. To keep the intrinsics tests working, for now,
/// format vectors containing f16 values the way C prints them.
fn print_result_rust(&self) -> String {
let return_value = match self.kind() {
TypeKind::Float if self.inner_size() == 16 => "debug_f16(__return_value)",
_ => "format_args!(\"{__return_value:.150?}\")",
};
String::from(return_value)
}
/// To enable architecture-specific logic
fn rust_scalar_type(&self) -> String {
format!(
"{prefix}{bits}",
prefix = self.kind().rust_prefix(),
bits = self.inner_size()
)
}
fn generate_final_type_cast(&self) -> String {
let type_data = self.c_promotion();
if type_data.len() > 2 {
format!("({type_data})")
} else {
String::new()
}
}
}
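Tying the last two helpers together, the behaviour implied by `c_promotion` above is:

// i8  => c_promotion() == "int"   => generate_final_type_cast() == "(int)"
// f32 => c_promotion() == "float" => "(float)"
// 128-bit types => c_promotion() == "" => String::new(), so no cast is emitted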

View file

@ -49,7 +49,7 @@ pub trait SupportedArchitectureTest {
fn cpp_compilation(&self) -> Option<CppCompilation>;
fn build_c_file(&self) -> bool {
let (chunk_size, chunk_count) = chunk_info(self.intrinsics().len());
let (chunk_size, chunk_count) = manual_chunk(self.intrinsics().len(), 400);
let cpp_compiler_wrapped = self.cpp_compilation();
@ -60,34 +60,42 @@ pub trait SupportedArchitectureTest {
.map(|(i, chunk)| {
let c_filename = format!("c_programs/mod_{i}.cpp");
let mut file = File::create(&c_filename).unwrap();
write_mod_cpp(
let mod_file_write_result = write_mod_cpp(
&mut file,
Self::NOTICE,
Self::PLATFORM_C_HEADERS,
Self::PLATFORM_C_FORWARD_DECLARATIONS,
chunk,
)
.unwrap();
);
if let Err(error) = mod_file_write_result {
return Err(format!("Error writing to mod_{i}.cpp: {error:?}"));
}
// compile this cpp file into a .o file.
//
// This is done because `cpp_compiler_wrapped` is None when
// the --generate-only flag is passed
trace!("compiling mod_{i}.cpp");
if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() {
let output = cpp_compiler
.compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o"))?;
assert!(output.status.success(), "{output:?}");
}
let compile_output = cpp_compiler
.compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o"));
trace!("finished compiling mod_{i}.cpp");
if let Err(compile_error) = compile_output {
return Err(format!("Error compiling mod_{i}.cpp: {compile_error:?}"));
}
}
Ok(())
})
.collect::<Result<(), std::io::Error>>()
.collect::<Result<(), String>>()
.unwrap();
let mut file = File::create("c_programs/main.cpp").unwrap();
write_main_cpp(
&mut file,
Self::PLATFORM_C_DEFINITIONS,
Self::PLATFORM_C_HEADERS,
self.intrinsics().iter().map(|i| i.name.as_str()),
)
.unwrap();
@ -96,7 +104,7 @@ pub trait SupportedArchitectureTest {
// the --generate-only flag is passed
if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() {
// compile this cpp file into a .o file
info!("compiling main.cpp");
trace!("compiling main.cpp");
let output = cpp_compiler
.compile_object_file("main.cpp", "intrinsic-test-programs.o")
.unwrap();
@ -118,7 +126,7 @@ pub trait SupportedArchitectureTest {
fn build_rust_file(&self) -> bool {
std::fs::create_dir_all("rust_programs/src").unwrap();
let (chunk_size, chunk_count) = chunk_info(self.intrinsics().len());
let (chunk_size, chunk_count) = manual_chunk(self.intrinsics().len(), 400);
let mut cargo = File::create("rust_programs/Cargo.toml").unwrap();
write_bin_cargo_toml(&mut cargo, chunk_count).unwrap();
@ -188,9 +196,13 @@ pub trait SupportedArchitectureTest {
}
}
pub fn chunk_info(intrinsic_count: usize) -> (usize, usize) {
let available_parallelism = std::thread::available_parallelism().unwrap().get();
let chunk_size = intrinsic_count.div_ceil(Ord::min(available_parallelism, intrinsic_count));
// pub fn chunk_info(intrinsic_count: usize) -> (usize, usize) {
// let available_parallelism = std::thread::available_parallelism().unwrap().get();
// let chunk_size = intrinsic_count.div_ceil(Ord::min(available_parallelism, intrinsic_count));
// (chunk_size, intrinsic_count.div_ceil(chunk_size))
// }
pub fn manual_chunk(intrinsic_count: usize, chunk_size: usize) -> (usize, usize) {
(chunk_size, intrinsic_count.div_ceil(chunk_size))
}
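For example, with the hard-coded chunk size of 400 used above:

// manual_chunk(1000, 400) == (400, 3): three modules of at most 400 intrinsics each
// manual_chunk(400, 400)  == (400, 1)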

View file

@ -4,6 +4,13 @@
pub fn value_for_array(bits: u32, index: u32) -> u64 {
let index = index as usize;
match bits {
1 => VALUES_8[index % 2].into(),
2 => VALUES_8[index % 4].into(),
3 => VALUES_8[index % 8].into(),
4 => VALUES_8[index % 16].into(),
5 => VALUES_5[index % VALUES_5.len()].into(),
6 => VALUES_6[index % VALUES_6.len()].into(),
7 => VALUES_7[index % VALUES_7.len()].into(),
8 => VALUES_8[index % VALUES_8.len()].into(),
16 => VALUES_16[index % VALUES_16.len()].into(),
32 => VALUES_32[index % VALUES_32.len()].into(),
@ -12,6 +19,24 @@ pub fn value_for_array(bits: u32, index: u32) -> u64 {
}
}
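A couple of concrete lookups under the new sub-byte arms (table values below):

// value_for_array(1, 3)  == VALUES_8[3 % 2]   == VALUES_8[1] == 0x01
// value_for_array(2, 6)  == VALUES_8[6 % 4]   == VALUES_8[2] == 0x02
// value_for_array(5, 33) == VALUES_5[33 % 32] == VALUES_5[1] == 0x01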
pub const VALUES_5: &[u8] = &[
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
];
pub const VALUES_6: &[u8] = &[
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
];
pub const VALUES_7: &[u8] = &[
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
];
pub const VALUES_8: &[u8] = &[
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0xf0, 0x80, 0x3b, 0xff,

View file

@ -3,10 +3,12 @@ extern crate log;
mod arm;
mod common;
mod x86;
use arm::ArmArchitectureTest;
use common::SupportedArchitectureTest;
use common::cli::{Cli, ProcessedCli};
use x86::X86ArchitectureTest;
fn main() {
pretty_env_logger::init();
@ -18,6 +20,7 @@ fn main() {
| "armv7-unknown-linux-gnueabihf"
| "aarch64_be-unknown-linux-gnu" => run(ArmArchitectureTest::create(processed_cli_options)),
"x86_64-unknown-linux-gnu" => run(X86ArchitectureTest::create(processed_cli_options)),
_ => std::process::exit(0),
}
}

View file

@ -0,0 +1,47 @@
use crate::common::cli::ProcessedCli;
use crate::common::compile_c::{CompilationCommandBuilder, CppCompilation};
pub fn build_cpp_compilation(config: &ProcessedCli) -> Option<CppCompilation> {
let cpp_compiler = config.cpp_compiler.as_ref()?;
// -ffp-contract=off emulates Rust's approach of not fusing separate mul-add operations
let mut command = CompilationCommandBuilder::new()
.add_arch_flags(["icelake-client"])
.set_compiler(cpp_compiler)
.set_target(&config.target)
.set_opt_level("2")
.set_cxx_toolchain_dir(config.cxx_toolchain_dir.as_deref())
.set_project_root("c_programs")
.add_extra_flags(vec![
"-ffp-contract=off",
"-Wno-narrowing",
"-mavx",
"-mavx2",
"-mavx512f",
"-msse2",
"-mavx512vl",
"-mavx512bw",
"-mavx512dq",
"-mavx512cd",
"-mavx512fp16",
"-msha512",
"-msm4",
"-mavxvnni",
"-mavx512bitalg",
"-mavx512ifma",
"-mavx512vbmi",
"-mavx512vbmi2",
"-mavx512vnni",
"-mavx512vpopcntdq",
"-ferror-limit=1000",
"-std=c++23",
]);
if !cpp_compiler.contains("clang") {
command = command.add_extra_flag("-flax-vector-conversions");
}
let cpp_compiler = command.into_cpp_compilation();
Some(cpp_compiler)
}

View file

@ -0,0 +1,409 @@
pub const NOTICE: &str = "\
// This is a transient test file, not intended for distribution. Some aspects of the
// test are derived from an XML specification, published under the same license as the
// `intrinsic-test` crate.\n";
// Format f16 values (and vectors containing them) in a way that is consistent with C.
pub const PLATFORM_RUST_DEFINITIONS: &str = r#"
use std::arch::x86_64::*;
#[inline]
unsafe fn _mm_loadu_ph_to___m128i(mem_addr: *const f16) -> __m128i {
_mm_castph_si128(_mm_loadu_ph(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_ph_to___m256i(mem_addr: *const f16) -> __m256i {
_mm256_castph_si256(_mm256_loadu_ph(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_ph_to___m512i(mem_addr: *const f16) -> __m512i {
_mm512_castph_si512(_mm512_loadu_ph(mem_addr))
}
#[inline]
unsafe fn _mm_loadu_ps_to___m128h(mem_addr: *const f32) -> __m128h {
_mm_castps_ph(_mm_loadu_ps(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_ps_to___m256h(mem_addr: *const f32) -> __m256h {
_mm256_castps_ph(_mm256_loadu_ps(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_ps_to___m512h(mem_addr: *const f32) -> __m512h {
_mm512_castps_ph(_mm512_loadu_ps(mem_addr))
}
#[inline]
unsafe fn _mm_loadu_epi16_to___m128d(mem_addr: *const i16) -> __m128d {
_mm_castsi128_pd(_mm_loadu_epi16(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_epi16_to___m256d(mem_addr: *const i16) -> __m256d {
_mm256_castsi256_pd(_mm256_loadu_epi16(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_epi16_to___m512d(mem_addr: *const i16) -> __m512d {
_mm512_castsi512_pd(_mm512_loadu_epi16(mem_addr))
}
#[inline]
unsafe fn _mm_loadu_epi32_to___m128d(mem_addr: *const i32) -> __m128d {
_mm_castsi128_pd(_mm_loadu_epi32(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_epi32_to___m256d(mem_addr: *const i32) -> __m256d {
_mm256_castsi256_pd(_mm256_loadu_epi32(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_epi32_to___m512d(mem_addr: *const i32) -> __m512d {
_mm512_castsi512_pd(_mm512_loadu_epi32(mem_addr))
}
#[inline]
unsafe fn _mm_loadu_epi64_to___m128d(mem_addr: *const i64) -> __m128d {
_mm_castsi128_pd(_mm_loadu_epi64(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_epi64_to___m256d(mem_addr: *const i64) -> __m256d {
_mm256_castsi256_pd(_mm256_loadu_epi64(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_epi64_to___m512d(mem_addr: *const i64) -> __m512d {
_mm512_castsi512_pd(_mm512_loadu_epi64(mem_addr))
}
// ===
#[inline]
unsafe fn _mm_loadu_epi16_to___m128(mem_addr: *const i16) -> __m128 {
_mm_castsi128_ps(_mm_loadu_epi16(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_epi16_to___m256(mem_addr: *const i16) -> __m256 {
_mm256_castsi256_ps(_mm256_loadu_epi16(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_epi16_to___m512(mem_addr: *const i16) -> __m512 {
_mm512_castsi512_ps(_mm512_loadu_epi16(mem_addr))
}
#[inline]
unsafe fn _mm_loadu_epi32_to___m128(mem_addr: *const i32) -> __m128 {
_mm_castsi128_ps(_mm_loadu_epi32(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_epi32_to___m256(mem_addr: *const i32) -> __m256 {
_mm256_castsi256_ps(_mm256_loadu_epi32(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_epi32_to___m512(mem_addr: *const i32) -> __m512 {
_mm512_castsi512_ps(_mm512_loadu_epi32(mem_addr))
}
#[inline]
unsafe fn _mm_loadu_epi64_to___m128(mem_addr: *const i64) -> __m128 {
_mm_castsi128_ps(_mm_loadu_epi64(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_epi64_to___m256(mem_addr: *const i64) -> __m256 {
_mm256_castsi256_ps(_mm256_loadu_epi64(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_epi64_to___m512(mem_addr: *const i64) -> __m512 {
_mm512_castsi512_ps(_mm512_loadu_epi64(mem_addr))
}
#[inline]
fn debug_simd_finish<T: core::fmt::Debug, const N: usize>(
formatter: &mut core::fmt::Formatter<'_>,
type_name: &str,
array: &[T; N],
) -> core::fmt::Result {
core::fmt::Formatter::debug_tuple_fields_finish(
formatter,
type_name,
&core::array::from_fn::<&dyn core::fmt::Debug, N, _>(|i| &array[i]),
)
}
#[repr(transparent)]
struct Hex<T>(T);
impl<T: DebugHexF16> core::fmt::Debug for Hex<T> {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
<T as DebugHexF16>::fmt(&self.0, f)
}
}
fn debug_f16<T: DebugHexF16>(x: T) -> impl core::fmt::Debug {
Hex(x)
}
trait DebugHexF16 {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result;
}
impl DebugHexF16 for f16 {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
write!(f, "{:#06x?}", self.to_bits())
}
}
impl DebugHexF16 for __m128h {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let array = unsafe { core::mem::transmute::<_, [Hex<f16>; 8]>(*self) };
debug_simd_finish(f, "__m128h", &array)
}
}
impl DebugHexF16 for __m128i {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let array = unsafe { core::mem::transmute::<_, [Hex<f16>; 8]>(*self) };
debug_simd_finish(f, "__m128i", &array)
}
}
impl DebugHexF16 for __m256h {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let array = unsafe { core::mem::transmute::<_, [Hex<f16>; 16]>(*self) };
debug_simd_finish(f, "__m256h", &array)
}
}
impl DebugHexF16 for __m256i {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let array = unsafe { core::mem::transmute::<_, [Hex<f16>; 16]>(*self) };
debug_simd_finish(f, "__m256i", &array)
}
}
impl DebugHexF16 for __m512h {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let array = unsafe { core::mem::transmute::<_, [Hex<f16>; 32]>(*self) };
debug_simd_finish(f, "__m512h", &array)
}
}
impl DebugHexF16 for __m512i {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let array = unsafe { core::mem::transmute::<_, [Hex<f16>; 32]>(*self) };
debug_simd_finish(f, "__m512i", &array)
}
}
trait DebugAs<T> {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result;
}
impl<T: core::fmt::Display> DebugAs<T> for T {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
write!(f, "{self}")
}
}
macro_rules! impl_debug_as {
($simd:ty, $name:expr, $bits:expr, [$($type:ty),+]) => {
$(
impl DebugAs<$type> for $simd {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
const ELEMENT_BITS: usize = core::mem::size_of::<$type>() * 8;
const NUM_ELEMENTS: usize = $bits / ELEMENT_BITS;
let array = unsafe { core::mem::transmute::<_, [$type; NUM_ELEMENTS]>(*self) };
debug_simd_finish(f, $name, &array)
}
}
)+
};
}
impl_debug_as!(__m128i, "__m128i", 128, [u8, i8, u16, i16, u32, i32, u64, i64]);
impl_debug_as!(__m256i, "__m256i", 256, [u8, i8, u16, i16, u32, i32, u64, i64]);
impl_debug_as!(__m512i, "__m512i", 512, [u8, i8, u16, i16, u32, i32, u64, i64]);
impl_debug_as!(__m128h, "__m128h", 128, [f32]);
impl_debug_as!(__m256h, "__m256h", 256, [f32]);
impl_debug_as!(__m512h, "__m512h", 512, [f32]);
fn debug_as<V, T>(x: V) -> impl core::fmt::Debug
where V: DebugAs<T>
{
struct DebugWrapper<V, T>(V, core::marker::PhantomData<T>);
impl<V: DebugAs<T>, T> core::fmt::Debug for DebugWrapper<V, T> {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
self.0.fmt(f)
}
}
DebugWrapper(x, core::marker::PhantomData)
}
"#;
pub const PLATFORM_C_FORWARD_DECLARATIONS: &str = r#"
#ifndef X86_DECLARATIONS
#define X86_DECLARATIONS
typedef _Float16 float16_t;
typedef float float32_t;
typedef double float64_t;
#define __int64 long long
#define __int32 int
std::ostream& operator<<(std::ostream& os, _Float16 value);
std::ostream& operator<<(std::ostream& os, __m128i value);
std::ostream& operator<<(std::ostream& os, __m256i value);
std::ostream& operator<<(std::ostream& os, __m512i value);
std::ostream& operator<<(std::ostream& os, __mmask8 value);
#define _mm512_extract_intrinsic_test_epi8(m, lane) \
_mm_extract_epi8(_mm512_extracti64x2_epi64((m), (lane) / 16), (lane) % 16)
#define _mm512_extract_intrinsic_test_epi16(m, lane) \
_mm_extract_epi16(_mm512_extracti64x2_epi64((m), (lane) / 8), (lane) % 8)
#define _mm512_extract_intrinsic_test_epi32(m, lane) \
_mm_extract_epi32(_mm512_extracti64x2_epi64((m), (lane) / 4), (lane) % 4)
#define _mm512_extract_intrinsic_test_epi64(m, lane) \
_mm_extract_epi64(_mm512_extracti64x2_epi64((m), (lane) / 2), (lane) % 2)
#define _mm64_extract_intrinsic_test_epi8(m, lane) \
((_mm_extract_pi16((m), (lane) / 2) >> (((lane) % 2) * 8)) & 0xFF)
#define _mm64_extract_intrinsic_test_epi32(m, lane) \
_mm_cvtsi64_si32(_mm_srli_si64(m, (lane) * 32))
// Load f16 (__m128h) and cast to integer (__m128i)
#define _mm_loadu_ph_to___m128i(mem_addr) _mm_castph_si128(_mm_loadu_ph(mem_addr))
#define _mm256_loadu_ph_to___m256i(mem_addr) _mm256_castph_si256(_mm256_loadu_ph(mem_addr))
#define _mm512_loadu_ph_to___m512i(mem_addr) _mm512_castph_si512(_mm512_loadu_ph(mem_addr))
// Load f32 (__m128) and cast to f16 (__m128h)
#define _mm_loadu_ps_to___m128h(mem_addr) _mm_castps_ph(_mm_loadu_ps(mem_addr))
#define _mm256_loadu_ps_to___m256h(mem_addr) _mm256_castps_ph(_mm256_loadu_ps(mem_addr))
#define _mm512_loadu_ps_to___m512h(mem_addr) _mm512_castps_ph(_mm512_loadu_ps(mem_addr))
// Load integer types and cast to double (__m128d, __m256d, __m512d)
#define _mm_loadu_epi16_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr)))
#define _mm256_loadu_epi16_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr)))
#define _mm512_loadu_epi16_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr)))
#define _mm_loadu_epi32_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr)))
#define _mm256_loadu_epi32_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr)))
#define _mm512_loadu_epi32_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr)))
#define _mm_loadu_epi64_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr)))
#define _mm256_loadu_epi64_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr)))
#define _mm512_loadu_epi64_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr)))
// Load integer types and cast to float (__m128, __m256, __m512)
#define _mm_loadu_epi16_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr)))
#define _mm256_loadu_epi16_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr)))
#define _mm512_loadu_epi16_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr)))
#define _mm_loadu_epi32_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr)))
#define _mm256_loadu_epi32_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr)))
#define _mm512_loadu_epi32_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr)))
#define _mm_loadu_epi64_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr)))
#define _mm256_loadu_epi64_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr)))
#define _mm512_loadu_epi64_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr)))
// T1 is the `To` type, T2 is the `From` type
template<typename T1, typename T2> T1 cast(T2 x) {
if constexpr ((std::is_integral_v<T1> && std::is_integral_v<T2>) || (std::is_floating_point_v<T1> && std::is_floating_point_v<T2>)) {
return x;
} else if constexpr (sizeof(T1) <= sizeof(T2)) {
T1 ret{};
std::memcpy(&ret, &x, sizeof(T1));
return ret;
} else {
static_assert(std::is_convertible_v<T2, T1>,
"T2 must either be convertible to T1, or have the same size as T1!");
// sizeof(T1) > sizeof(T2) here, so a size-equal bitcast is impossible;
// preserve the value via the asserted conversion instead of discarding it.
return static_cast<T1>(x);
}
}
#endif
"#;
pub const PLATFORM_C_DEFINITIONS: &str = r#"
std::ostream& operator<<(std::ostream& os, _Float16 value) {
uint16_t temp = 0;
memcpy(&temp, &value, sizeof(_Float16));
std::stringstream ss;
ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << temp;
os << ss.str();
return os;
}
std::ostream& operator<<(std::ostream& os, __m128i value) {
// Stack buffer instead of malloc: the original allocation was never freed.
unsigned char temp[sizeof(__m128i)];
_mm_storeu_si128((__m128i*)temp, value);
std::stringstream ss;
ss << "0x";
for(int i = 0; i < 16; i++) {
// Cast to int: streaming a raw char prints the character, not its hex value.
ss << std::setfill('0') << std::setw(2) << std::hex << static_cast<int>(temp[i]);
}
os << ss.str();
return os;
}
std::ostream& operator<<(std::ostream& os, __m256i value) {
unsigned char temp[sizeof(__m256i)];
_mm256_storeu_si256((__m256i*)temp, value);
std::stringstream ss;
ss << "0x";
for(int i = 0; i < 32; i++) {
ss << std::setfill('0') << std::setw(2) << std::hex << static_cast<int>(temp[i]);
}
os << ss.str();
return os;
}
std::ostream& operator<<(std::ostream& os, __m512i value) {
unsigned char temp[sizeof(__m512i)];
_mm512_storeu_si512((__m512i*)temp, value);
std::stringstream ss;
ss << "0x";
for(int i = 0; i < 64; i++) {
ss << std::setfill('0') << std::setw(2) << std::hex << static_cast<int>(temp[i]);
}
os << ss.str();
return os;
}
std::ostream& operator<<(std::ostream& os, __mmask8 value) {
os << static_cast<int>(value);
return os;
}
"#;
pub const PLATFORM_RUST_CFGS: &str = r#"
#![cfg_attr(target_arch = "x86", feature(avx))]
#![cfg_attr(target_arch = "x86", feature(sse))]
#![cfg_attr(target_arch = "x86", feature(sse2))]
#![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512_bf16))]
#![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512_f16))]
#![cfg_attr(target_arch = "x86", feature(stdarch_x86_rtm))]
#![cfg_attr(target_arch = "x86", feature(stdarch_x86_rtm))]
#![cfg_attr(target_arch = "x86_64", feature(x86_amx_intrinsics))]
#![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512_f16))]
#![feature(fmt_helpers_for_derive)]
"#;

View file

@ -0,0 +1,30 @@
use crate::common::constraint::Constraint;
pub fn map_constraints(imm_type: &str, imm_width: u32) -> Option<Constraint> {
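// An n-bit immediate admits the half-open range 0..2^n.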
if imm_width > 0 {
let max: i64 = 2i64.pow(imm_width);
return Some(Constraint::Range(0..max));
}
match imm_type {
// Legal values for variables of `_MM_FROUND` type are:
// 8 => (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
// 9 => (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
// 10 => (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
// 11 => (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
// 4 => _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
"_MM_FROUND" => Some(Constraint::Set(vec![4, 8, 9, 10, 11])),
"_MM_INDEX_SCALE" => Some(Constraint::Set(vec![1, 2, 4, 8])),
"_MM_CMPINT" => Some(Constraint::Range(0..8)),
"_MM_REDUCE" => Some(Constraint::Range(0..8)),
"_MM_FROUND_SAE" => Some(Constraint::Equal(8)),
"_MM_MANTISSA_NORM" => Some(Constraint::Range(0..4)),
"_MM_MANTISSA_NORM_ENUM" => Some(Constraint::Range(0..4)),
"_MM_MANTISSA_SIGN" => Some(Constraint::Range(0..3)),
"_MM_PERM" => Some(Constraint::Range(0..256)),
"_MM_PERM_ENUM" => Some(Constraint::Range(0..256)),
"_MM_CMPINT_ENUM" => Some(Constraint::Range(0..8)),
"_MM_ROUND_MODE" => Some(Constraint::Set(vec![0, 0x2, 0x4, 0x6])),
"_CMP_" => Some(Constraint::Range(0..32)),
_ => None,
}
}

View file

@ -0,0 +1,23 @@
use crate::common::intrinsic_helpers::IntrinsicType;
use crate::x86::xml_parser::Parameter;
use std::ops::{Deref, DerefMut};
#[derive(Debug, Clone, PartialEq)]
pub struct X86IntrinsicType {
pub data: IntrinsicType,
pub param: Parameter,
}
impl Deref for X86IntrinsicType {
type Target = IntrinsicType;
fn deref(&self) -> &Self::Target {
&self.data
}
}
impl DerefMut for X86IntrinsicType {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.data
}
}

View file

@ -0,0 +1,76 @@
mod compile;
mod config;
mod constraint;
mod intrinsic;
mod types;
mod xml_parser;
use crate::common::SupportedArchitectureTest;
use crate::common::cli::ProcessedCli;
use crate::common::compile_c::CppCompilation;
use crate::common::intrinsic::Intrinsic;
use crate::common::intrinsic_helpers::TypeKind;
use intrinsic::X86IntrinsicType;
use itertools::Itertools;
use xml_parser::get_xml_intrinsics;
pub struct X86ArchitectureTest {
intrinsics: Vec<Intrinsic<X86IntrinsicType>>,
cli_options: ProcessedCli,
}
impl SupportedArchitectureTest for X86ArchitectureTest {
type IntrinsicImpl = X86IntrinsicType;
fn cli_options(&self) -> &ProcessedCli {
&self.cli_options
}
fn intrinsics(&self) -> &[Intrinsic<X86IntrinsicType>] {
&self.intrinsics
}
fn cpp_compilation(&self) -> Option<CppCompilation> {
compile::build_cpp_compilation(&self.cli_options)
}
const NOTICE: &str = config::NOTICE;
const PLATFORM_C_HEADERS: &[&str] = &["immintrin.h", "cstddef", "cstdint"];
const PLATFORM_C_DEFINITIONS: &str = config::PLATFORM_C_DEFINITIONS;
const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::PLATFORM_C_FORWARD_DECLARATIONS;
const PLATFORM_RUST_DEFINITIONS: &str = config::PLATFORM_RUST_DEFINITIONS;
const PLATFORM_RUST_CFGS: &str = config::PLATFORM_RUST_CFGS;
fn create(cli_options: ProcessedCli) -> Self {
let intrinsics =
get_xml_intrinsics(&cli_options.filename).expect("Error parsing input file");
let sample_percentage: usize = cli_options.sample_percentage as usize;
let mut intrinsics = intrinsics
.into_iter()
// Not sure how we would compare an intrinsic that returns void.
.filter(|i| i.results.kind() != TypeKind::Void)
.filter(|i| i.results.kind() != TypeKind::BFloat)
.filter(|i| !i.arguments.args.is_empty())
.filter(|i| !i.arguments.iter().any(|a| a.ty.kind() == TypeKind::BFloat))
// Skip pointers for now, we would probably need to look at the return
// type to work out how many elements we need to point to.
.filter(|i| !i.arguments.iter().any(|a| a.is_ptr()))
.filter(|i| !i.arguments.iter().any(|a| a.ty.inner_size() == 128))
.filter(|i| !cli_options.skip.contains(&i.name))
.unique_by(|i| i.name.clone())
.collect::<Vec<_>>();
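// Sampling takes a deterministic prefix of the filtered list; sorting
// afterwards keeps the generated test order stable.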
let sample_size = (intrinsics.len() * sample_percentage) / 100;
intrinsics.truncate(sample_size);
intrinsics.sort_by(|a, b| a.name.cmp(&b.name));
Self { intrinsics, cli_options }
}
}

View file

@ -0,0 +1,485 @@
use std::str::FromStr;
use itertools::Itertools;
use regex::Regex;
use super::intrinsic::X86IntrinsicType;
use crate::common::cli::Language;
use crate::common::indentation::Indentation;
use crate::common::intrinsic_helpers::{IntrinsicType, IntrinsicTypeDefinition, Sign, TypeKind};
use crate::x86::xml_parser::Parameter;
impl IntrinsicTypeDefinition for X86IntrinsicType {
/// Gets a string containing the type in C format, mapping MSVC-style
/// fixed-width names to their <cstdint> equivalents.
fn c_type(&self) -> String {
self.param
.type_data
.replace("unsigned __int64", "uint64_t")
.replace("unsigned __int32", "uint32_t")
.replace("unsigned __int16", "uint16_t")
.replace("unsigned __int8", "uint8_t")
.replace("__int64", "int64_t")
.replace("__int32", "int32_t")
.replace("__int16", "int16_t")
.replace("__int8", "int8_t")
.replace("const ", "")
}
fn c_single_vector_type(&self) -> String {
// matches __m128, __m256 and similar types
let re = Regex::new(r"__m\d+").unwrap();
if re.is_match(self.param.type_data.as_str()) {
self.param.type_data.clone()
} else {
unreachable!("Shouldn't be called on this type")
}
}
// fn rust_type(&self) -> String {
// // handling edge cases first
// // the general handling is implemented below
// if let Some(val) = self.metadata.get("type") {
// match val.as_str() {
// "__m128 const *" => {
// return "&__m128".to_string();
// }
// "__m128d const *" => {
// return "&__m128d".to_string();
// }
// "const void*" => {
// return "&__m128d".to_string();
// }
// _ => {}
// }
// }
// if self.kind() == TypeKind::Void && self.ptr {
// // this has been handled by default settings in
// // the from_param function of X86IntrinsicType
// unreachable!()
// }
// // general handling cases
// let core_part = if self.kind() == TypeKind::Mask {
// // all types of __mmask<int> are handled here
// format!("__mask{}", self.bit_len.unwrap())
// } else if self.simd_len.is_some() {
// // all types of __m<int> vector types are handled here
// let re = Regex::new(r"\__m\d+[a-z]*").unwrap();
// let rust_type = self
// .metadata
// .get("type")
// .map(|val| re.find(val).unwrap().as_str());
// rust_type.unwrap().to_string()
// } else {
// format!(
// "{}{}",
// self.kind.rust_prefix().to_string(),
// self.bit_len.unwrap()
// )
// };
// // extracting "memsize" so that even vector types can be involved
// let memwidth = self
// .metadata
// .get("memwidth")
// .map(|n| str::parse::<u32>(n).unwrap());
// let prefix_part = if self.ptr && self.constant && self.bit_len.eq(&memwidth) {
// "&"
// } else if self.ptr && self.bit_len.eq(&memwidth) {
// "&mut "
// } else if self.ptr && self.constant {
// "*const "
// } else if self.ptr {
// "*mut "
// } else {
// ""
// };
// return prefix_part.to_string() + core_part.as_str();
// }
/// Determines the load function for this type.
fn get_load_function(&self, _language: Language) -> String {
let type_value = self.param.type_data.clone();
if type_value.is_empty() {
unimplemented!("the parameter's `type` attribute is empty!");
}
if type_value.starts_with("__mmask") {
// no load needed: mask values are initialized
// directly from hex constants
String::from("*")
} else if type_value.starts_with("__m") {
// The structure is as follows:
// if "type" starts with __m<num>{h/i/<nothing>},
// use the matching _mm_loadu_*, _mm256_loadu_*
// or _mm512_loadu_* variant.
if type_value.contains("__m64") {
return String::from("*(__m64*)");
}
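// Keep only the digits of the register type and drop "128"/"64", so that
// "__m128*" maps to the bare `_mm_` prefix while "__m256*"/"__m512*" keep
// their width (`_mm256_`, `_mm512_`).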
let type_val_filtered = type_value
.chars()
.filter(|c| c.is_numeric())
.join("")
.replace("128", "")
.replace("64", "");
{
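// When the element type does not match the register type (e.g. f16 lanes
// carried in an `__m128i`), route through the `*_loadu_*_to_*` cast macros
// declared in the platform C forward declarations.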
let suffix = match (self.bit_len, self.kind) {
(Some(16), TypeKind::Float)
if ["__m128i", "__m256i", "__m512i"]
.contains(&self.param.type_data.as_str()) =>
{
format!("ph_to_{}", self.param.type_data)
}
(Some(32), TypeKind::Float)
if ["__m128h", "__m256h", "__m512h"]
.contains(&self.param.type_data.as_str()) =>
{
format!("ps_to_{}", self.param.type_data)
}
(Some(bit_len @ (16 | 32 | 64)), TypeKind::Int(_) | TypeKind::Mask)
if ["__m128d", "__m256d", "__m512d"]
.contains(&self.param.type_data.as_str()) =>
{
format!("epi{bit_len}_to_{}", self.param.type_data)
}
(Some(bit_len @ (16 | 32 | 64)), TypeKind::Int(_) | TypeKind::Mask)
if ["__m128", "__m256", "__m512"]
.contains(&self.param.type_data.as_str()) =>
{
format!("epi{bit_len}_to_{}", self.param.type_data)
}
(Some(bit_len @ (8 | 16 | 32 | 64)), TypeKind::Int(_)) => {
format!("epi{bit_len}")
}
(Some(bit_len), TypeKind::Mask) => format!("epi{bit_len}"),
(Some(16), TypeKind::Float) => String::from("ph"),
(Some(32), TypeKind::Float) => String::from("ps"),
(Some(64), TypeKind::Float) => String::from("pd"),
(Some(128 | 256 | 512), TypeKind::Vector) => String::from("epi32"),
_ => unreachable!("Invalid element type for a vector type! {:?}", self.param),
};
format!("_mm{type_val_filtered}_loadu_{suffix}")
}
} else {
// Anything else (__int<num>, __bfloat16, unsigned short, pointers, etc.)
// is handled with a plain C-style cast.
format!("({type_value})")
}
}
/// Generates a std::cout for the intrinsics results that will match the
/// rust debug output format for the return type. The generated line assumes
/// there is an int i in scope which is the current pass number.
fn print_result_c(&self, indentation: Indentation, additional: &str) -> String {
let lanes = if self.num_vectors() > 1 {
(0..self.num_vectors())
.map(|vector| {
format!(
r#""{ty}(" << {lanes} << ")""#,
ty = self.c_single_vector_type(),
lanes = (0..self.num_lanes())
.map(move |idx| -> String {
format!(
"{cast}{lane_fn}(__return_value.val[{vector}], {lane})",
cast = self.generate_final_type_cast(),
lane_fn = self.get_lane_function(),
lane = idx,
vector = vector,
)
})
.collect::<Vec<_>>()
.join(r#" << ", " << "#)
)
})
.collect::<Vec<_>>()
.join(r#" << ", " << "#)
} else if self.num_lanes() > 1 {
(0..self.num_lanes())
.map(|idx| -> String {
let cast_type = self.c_promotion();
let lane_fn = self.get_lane_function();
if cast_type.len() > 2 {
format!("cast<{cast_type}>({lane_fn}(__return_value, {idx}))")
} else {
format!("{lane_fn}(__return_value, {idx})")
}
})
.collect::<Vec<_>>()
.join(r#" << ", " << "#)
} else {
format!(
"{promote}cast<{cast}>(__return_value)",
cast = match self.kind() {
TypeKind::Void => "void".to_string(),
TypeKind::Float if self.inner_size() == 64 => "double".to_string(),
TypeKind::Float if self.inner_size() == 32 => "float".to_string(),
TypeKind::Mask => format!(
"__mmask{}",
self.bit_len.expect(format!("self: {self:#?}").as_str())
),
TypeKind::Vector => format!(
"__m{}i",
self.bit_len.expect(format!("self: {self:#?}").as_str())
),
_ => self.c_scalar_type(),
},
promote = self.generate_final_type_cast(),
)
};
format!(
r#"{indentation}std::cout << "Result {additional}-" << i+1 << ": {ty}" << std::fixed << std::setprecision(150) << {lanes} << "{close}" << std::endl;"#,
ty = if self.is_simd() {
format!("{}(", self.c_type())
} else {
String::from("")
},
close = if self.is_simd() { ")" } else { "" },
)
}
/// Determines the get lane function for this type.
fn get_lane_function(&self) -> String {
let total_vector_bits: Option<u32> = self
.simd_len
.zip(self.bit_len)
.map(|(simd_len, bit_len)| simd_len * bit_len);
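// Select the extract helper by element width and total vector width; the
// unsigned casts normalize the printed value to the lane's unsigned
// representation instead of a sign-extended int.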
match (self.bit_len, total_vector_bits) {
(Some(8), Some(128)) => String::from("(uint8_t)_mm_extract_epi8"),
(Some(16), Some(128)) => String::from("(uint16_t)_mm_extract_epi16"),
(Some(32), Some(128)) => String::from("(uint32_t)_mm_extract_epi32"),
(Some(64), Some(128)) => String::from("(uint64_t)_mm_extract_epi64"),
(Some(8), Some(256)) => String::from("(uint8_t)_mm256_extract_epi8"),
(Some(16), Some(256)) => String::from("(uint16_t)_mm256_extract_epi16"),
(Some(32), Some(256)) => String::from("(uint32_t)_mm256_extract_epi32"),
(Some(64), Some(256)) => String::from("(uint64_t)_mm256_extract_epi64"),
(Some(8), Some(512)) => String::from("(uint8_t)_mm512_extract_intrinsic_test_epi8"),
(Some(16), Some(512)) => String::from("(uint16_t)_mm512_extract_intrinsic_test_epi16"),
(Some(32), Some(512)) => String::from("(uint32_t)_mm512_extract_intrinsic_test_epi32"),
(Some(64), Some(512)) => String::from("(uint64_t)_mm512_extract_intrinsic_test_epi64"),
(Some(8), Some(64)) => String::from("(uint8_t)_mm64_extract_intrinsic_test_epi8"),
(Some(16), Some(64)) => String::from("(uint16_t)_mm_extract_pi16"),
(Some(32), Some(64)) => String::from("(uint32_t)_mm64_extract_intrinsic_test_epi32"),
_ => unreachable!(
"invalid length for vector argument: {:?}, {:?}",
self.bit_len, self.simd_len
),
}
}
fn rust_scalar_type(&self) -> String {
let prefix = match self.data.kind {
TypeKind::Mask => String::from("__mmask"),
TypeKind::Vector => String::from("i"),
_ => self.kind().rust_prefix().to_string(),
};
let bits = if self.inner_size() >= 128 {
32
} else {
self.inner_size()
};
format!("{prefix}{bits}")
}
fn print_result_rust(&self) -> String {
let return_value = match self.kind() {
TypeKind::Float if self.inner_size() == 16 => "debug_f16(__return_value)".to_string(),
TypeKind::Float
if self.inner_size() == 32
&& ["__m512h"].contains(&self.param.type_data.as_str()) =>
{
"debug_as::<_, f32>(__return_value)".to_string()
}
TypeKind::Int(_)
if ["__m128i", "__m256i", "__m512i"].contains(&self.param.type_data.as_str()) =>
{
format!("debug_as::<_, u{}>(__return_value)", self.inner_size())
}
_ => "format_args!(\"{__return_value:.150?}\")".to_string(),
};
return_value
}
}
impl X86IntrinsicType {
fn from_c(s: &str) -> Result<IntrinsicType, String> {
let s_copy = s
.replace("*", "")
.replace("_", "")
.replace("constexpr", "")
.replace("const", "")
.replace("literal", "");
let s_split = s_copy
.split(" ")
.filter(|s| !s.is_empty())
.last();
let s_split = s_split.map(|s| s.chars().filter(|c| !c.is_numeric()).join(""));
// TODO: make the unwrapping safe
let kind = TypeKind::from_str(s_split.unwrap().trim()).unwrap_or(TypeKind::Void);
let kind = if s.find("unsigned").is_some() {
match kind {
TypeKind::Int(_) => TypeKind::Int(Sign::Unsigned),
TypeKind::Char(_) => TypeKind::Char(Sign::Unsigned),
a => a,
}
} else {
kind
};
let ptr_constant = false;
let constant = s.matches("const").next().is_some();
let ptr = s.matches("*").next().is_some();
Ok(IntrinsicType {
ptr,
ptr_constant,
constant,
kind,
bit_len: None,
simd_len: None,
vec_len: None,
})
}
pub fn update_simd_len(&mut self) {
let mut type_processed = self.param.type_data.clone();
type_processed.retain(|c| c.is_numeric());
// check the param.type and extract numeric part if there are double
// underscores. divide this number with bit-len and set this as simd-len.
// Only __m<int> types can have a simd-len.
if self.param.type_data.contains("__m") && !self.param.type_data.contains("__mmask") {
self.data.simd_len = match str::parse::<u32>(type_processed.as_str()) {
// If bit_len is None, simd_len will be None.
// Else simd_len will be (num_bits / bit_len).
Ok(num_bits) => self
.data
.bit_len
.map(|bit_len| num_bits / bit_len),
Err(_) => None,
};
}
}
pub fn from_param(param: &Parameter) -> Result<Self, String> {
match Self::from_c(param.type_data.as_str()) {
Err(message) => Err(message),
Ok(mut data) => {
// First correct the type of the parameter using param.etype.
// The assumption is that the parameter of type void may have param.type
// as "__m128i", "__mmask8" and the like.
if !param.etype.is_empty() {
match TypeKind::from_str(param.etype.as_str()) {
Ok(value) => {
data.kind = value;
}
Err(_) => {}
};
}
// check for param.etype.
// extract the numeric part and set as bit-len
// If param.etype is not present, guess the default bit-len
let mut etype_processed = param.etype.clone();
etype_processed.retain(|c| c.is_numeric());
let mut type_processed = param.type_data.clone();
type_processed.retain(|c| c.is_numeric());
match str::parse::<u32>(etype_processed.as_str()) {
Ok(value) => data.bit_len = Some(value),
Err(_) => {
data.bit_len = match data.kind() {
TypeKind::Char(_) => Some(8),
TypeKind::BFloat => Some(16),
TypeKind::Int(_) => Some(32),
TypeKind::Float => Some(32),
_ => None,
};
}
}
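// `__mmask<N>` encodes its element width in the type name itself,
// overriding whatever the etype suggested.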
if param.type_data.contains("__mmask") {
data.bit_len = str::parse::<u32>(type_processed.as_str()).ok();
}
if vec!["M512", "M256", "M128"].contains(&param.etype.as_str()) {
match param.type_data.chars().last() {
Some('i') => {
data.kind = TypeKind::Int(Sign::Signed);
data.bit_len = Some(32);
}
Some('h') => {
data.kind = TypeKind::Float;
data.bit_len = Some(16);
}
Some('d') => {
data.kind = TypeKind::Float;
data.bit_len = Some(64);
}
_ => (),
}
}
// default settings for "void *" parameters
// often used by intrinsics to denote memory address or so.
if data.kind == TypeKind::Void && data.ptr {
data.kind = TypeKind::Int(Sign::Unsigned);
data.bit_len = Some(8);
}
// default settings for "void *" parameters
// often used by intrinsics to denote memory address or so.
if data.kind == TypeKind::Mask && data.bit_len.is_none() {
data.bit_len = Some(32);
}
if param.etype == "IMM" || param.imm_width > 0 || param.imm_type.len() > 0 {
data.kind = TypeKind::Int(Sign::Unsigned);
data.constant = true;
}
// Rust defaults to the signed variant unless "unsigned" is explicitly
// mentioned; the `type` field holds C++ type names.
if data.kind == TypeKind::Int(Sign::Unsigned)
&& !(param.type_data.contains("unsigned") || param.type_data.contains("uint"))
{
data.kind = TypeKind::Int(Sign::Signed)
}
// default settings for IMM parameters
if param.etype == "IMM" {
data.bit_len = if param.imm_width > 0 {
Some(param.imm_width)
} else {
Some(8)
}
}
let mut result = X86IntrinsicType {
data,
param: param.clone(),
};
result.update_simd_len();
Ok(result)
}
}
// Tile types won't currently reach here, since the intrinsics that involve
// them often return a "null" type. Such intrinsics are not tested in
// `intrinsic-test` currently and are filtered out at `mod.rs`.
}
}

View file

@ -0,0 +1,139 @@
use crate::common::argument::{Argument, ArgumentList};
use crate::common::intrinsic::Intrinsic;
use crate::common::intrinsic_helpers::TypeKind;
use crate::x86::constraint::map_constraints;
use regex::Regex;
use serde::{Deserialize, Deserializer};
use std::path::Path;
use super::intrinsic::X86IntrinsicType;
// Custom deserializer that parses a string attribute as a u32, defaulting to 0 on failure.
fn string_to_u32<'de, D>(deserializer: D) -> Result<u32, D::Error>
where
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
Ok(s.parse::<u32>().unwrap_or(0))
}
#[derive(Deserialize)]
struct Data {
#[serde(rename = "intrinsic", default)]
intrinsics: Vec<XMLIntrinsic>,
}
#[derive(Deserialize)]
struct XMLIntrinsic {
#[serde(rename = "return")]
pub return_data: Parameter,
#[serde(rename = "@name")]
pub name: String,
// #[serde(rename = "@tech")]
// tech: String,
#[serde(rename = "CPUID", default)]
cpuid: Vec<String>,
#[serde(rename = "parameter", default)]
parameters: Vec<Parameter>,
}
#[derive(Debug, PartialEq, Clone, Deserialize)]
pub struct Parameter {
#[serde(rename = "@varname", default)]
pub var_name: String,
#[serde(rename = "@type", default)]
pub type_data: String,
#[serde(rename = "@etype", default)]
pub etype: String,
#[serde(rename = "@memwidth", default, deserialize_with = "string_to_u32")]
pub memwidth: u32,
#[serde(rename = "@immwidth", default, deserialize_with = "string_to_u32")]
pub imm_width: u32,
#[serde(rename = "@immtype", default)]
pub imm_type: String,
}
pub fn get_xml_intrinsics(
filename: &Path,
) -> Result<Vec<Intrinsic<X86IntrinsicType>>, Box<dyn std::error::Error>> {
let file = std::fs::File::open(filename)?;
let reader = std::io::BufReader::new(file);
let data: Data =
quick_xml::de::from_reader(reader).expect("failed to deserialize the source XML file");
let parsed_intrinsics: Vec<Intrinsic<X86IntrinsicType>> = data
.intrinsics
.into_iter()
.filter_map(|intr| {
// Some(xml_to_intrinsic(intr, target).expect("Couldn't parse XML properly!"))
xml_to_intrinsic(intr).ok()
})
.collect();
Ok(parsed_intrinsics)
}
fn xml_to_intrinsic(
intr: XMLIntrinsic,
) -> Result<Intrinsic<X86IntrinsicType>, Box<dyn std::error::Error>> {
let name = intr.name;
let result = X86IntrinsicType::from_param(&intr.return_data);
let args_check = intr.parameters.into_iter().enumerate().map(|(i, param)| {
let ty = X86IntrinsicType::from_param(&param).ok()?;
let effective_imm_width = if name == "_mm_mpsadbw_epu8" && param.var_name == "imm8" {
3
} else {
param.imm_width
};
let constraint = map_constraints(&param.imm_type, effective_imm_width);
Some(Argument::<X86IntrinsicType>::new(
i,
param.var_name.clone(),
ty,
constraint,
))
});
let mut args = args_check
.collect::<Option<Vec<_>>>()
.ok_or("intrinsic isn't fully supported in this test!")?
.into_iter()
.filter(|arg| arg.ty.ptr || arg.ty.kind != TypeKind::Void)
.collect::<Vec<_>>();
// if one of the args has etype="MASK" and type="__m<int>d",
// then set the bit_len and simd_len accordingly
let re = Regex::new(r"__m\d+").unwrap();
let is_mask = |arg: &Argument<X86IntrinsicType>| arg.ty.param.etype.as_str() == "MASK";
let is_vector = |arg: &Argument<X86IntrinsicType>| re.is_match(arg.ty.param.type_data.as_str());
let pos = args.iter().position(|arg| is_mask(arg) && is_vector(arg));
if let Some(index) = pos {
args[index].ty.bit_len = args[0].ty.bit_len;
}
args.iter_mut().for_each(|arg| arg.ty.update_simd_len());
let arguments = ArgumentList::<X86IntrinsicType> { args };
let results = result?;
Ok(Intrinsic {
name,
arguments,
results,
arch_tags: intr.cpuid,
})
}

View file

@ -13,10 +13,6 @@ auto_llvm_sign_conversion: false
neon-stable: &neon-stable
FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']]
# #[cfg(not(target_arch = "arm"))]
target-not-arm: &target-not-arm
FnCall: [cfg, [{ FnCall: [not, ['target_arch = "arm"']]}]]
# #[cfg(not(target_arch = "arm64ec"))]
target-not-arm64ec: &target-not-arm64ec
FnCall: [cfg, [{ FnCall: [not, ['target_arch = "arm64ec"']]}]]
@ -63,6 +59,9 @@ neon-unstable-f16: &neon-unstable-f16
neon-unstable-feat-lut: &neon-unstable-feat-lut
FnCall: [unstable, ['feature = "stdarch_neon_feat_lut"', 'issue = "138050"']]
aarch64-unstable-jscvt: &aarch64-unstable-jscvt
FnCall: [unstable, ['feature = "stdarch_aarch64_jscvt"', 'issue = "147555"']]
# #[cfg(target_endian = "little")]
little-endian: &little-endian
FnCall: [cfg, ['target_endian = "little"']]
@ -8781,7 +8780,6 @@ intrinsics:
- [float64x1_t, float32x2_t]
- [float32x4_t, float64x2_t]
- [float64x2_t, float32x4_t]
big_endian_inverse: false
compose:
- FnCall: [transmute, [a]]
@ -8802,7 +8800,6 @@ intrinsics:
# q
- [float64x2_t, float16x8_t]
- [float16x8_t, float64x2_t]
big_endian_inverse: false
compose:
- FnCall: [transmute, [a]]
@ -13082,7 +13079,6 @@ intrinsics:
return_type: "{type[0]}"
attr:
- FnCall: [target_feature, ['enable = "crc"']]
- *target-not-arm
- FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["crc32x"]] }]]
- *aarch64-crc-stable
safety: safe
@ -13104,7 +13100,6 @@ intrinsics:
return_type: "{type[0]}"
attr:
- FnCall: [target_feature, ['enable = "crc"']]
- *target-not-arm
- FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["crc32cx"]] }]]
- *aarch64-crc-stable
safety: safe
@ -14267,3 +14262,21 @@ intrinsics:
- 'vluti4q_laneq_{neon_type[5]}_x2::<LANE>'
- - FnCall: [transmute, [a]]
- b
- name: "__jcvt"
doc: "Floating-point JavaScript convert to signed fixed-point, rounding toward zero"
arguments: ["a: {type}"]
return_type: "i32"
attr:
- FnCall: [target_feature, ['enable = "jsconv"']]
- FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["fjcvtzs"]] }]]
- *aarch64-unstable-jscvt
safety: safe
types:
- f64
compose:
- LLVMLink:
name: "fjcvtzs"
links:
- link: "llvm.aarch64.fjcvtzs"
arch: aarch64,arm64ec

View file

@ -8480,7 +8480,6 @@ intrinsics:
- [poly16x8_t, p128]
- [int8x16_t, p128]
- [uint8x16_t, p128]
big_endian_inverse: false
compose:
- FnCall: [transmute, [a]]
@ -8718,7 +8717,6 @@ intrinsics:
- [poly8x16_t, float32x4_t]
- [poly16x8_t, float32x4_t]
- [p128, float32x4_t]
big_endian_inverse: false
compose:
- FnCall: [transmute, [a]]
@ -8782,7 +8780,6 @@ intrinsics:
- [float16x8_t, uint16x8_t]
- [float16x8_t, uint32x4_t]
- [float16x8_t, uint64x2_t]
big_endian_inverse: false
compose:
- FnCall: [transmute, [a]]
@ -8807,7 +8804,6 @@ intrinsics:
- [poly128_t, float16x8_t]
- [float16x8_t, poly128_t]
- [float16x8_t, poly64x2_t]
big_endian_inverse: false
compose:
- FnCall: [transmute, [a]]

View file

@ -164,7 +164,7 @@ fn verify_all_signatures() {
// Open up the network console and you'll see an xml file was downloaded
// (currently called data-3.6.9.xml). That's the file we downloaded
// here.
let xml = include_bytes!("../x86-intel.xml");
let xml = include_bytes!("../../../intrinsics_data/x86-intel.xml");
let xml = &xml[..];
let data: Data = quick_xml::de::from_reader(xml).expect("failed to deserialize xml");

View file

@ -119753,5 +119753,28 @@
"LUTI4"
]
]
},
{
"SIMD_ISA": "Neon",
"name": "__jcvt",
"arguments": [
"float64_t a"
],
"return_type": {
"value": "int32_t"
},
"Arguments_Preparation": {
"a": {
"register": "Dn"
}
},
"Architectures": [
"A64"
],
"instructions": [
[
"FJCVTZS"
]
]
}
]

View file

@ -1 +1 @@
32e7a4b92b109c24e9822c862a7c74436b50e564
73e6c9ebd9123154a196300ef58e30ec8928e74e

View file

@ -217,50 +217,6 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
)?;
}
}
// Used to implement the _mm256_permute2f128_ps, _mm256_permute2f128_pd and
// _mm256_permute2f128_si256 functions. Regardless of the suffix in the name
// they all can be considered to operate on vectors of 128-bit elements.
// For each 128-bit element of `dest`, copies one from `left`, `right` or
// zero, according to `imm`.
"vperm2f128.ps.256" | "vperm2f128.pd.256" | "vperm2f128.si.256" => {
let [left, right, imm] =
this.check_shim_sig_lenient(abi, CanonAbi::C, link_name, args)?;
assert_eq!(dest.layout, left.layout);
assert_eq!(dest.layout, right.layout);
assert_eq!(dest.layout.size.bits(), 256);
// Transmute to `[u128; 2]` to process each 128-bit chunk independently.
let u128x2_layout =
this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u128, 2))?;
let left = left.transmute(u128x2_layout, this)?;
let right = right.transmute(u128x2_layout, this)?;
let dest = dest.transmute(u128x2_layout, this)?;
let imm = this.read_scalar(imm)?.to_u8()?;
for i in 0..2 {
let dest = this.project_index(&dest, i)?;
let imm = match i {
0 => imm & 0xF,
1 => imm >> 4,
_ => unreachable!(),
};
if imm & 0b100 != 0 {
this.write_scalar(Scalar::from_u128(0), &dest)?;
} else {
let src = match imm {
0b00 => this.project_index(&left, 0)?,
0b01 => this.project_index(&left, 1)?,
0b10 => this.project_index(&right, 0)?,
0b11 => this.project_index(&right, 1)?,
_ => unreachable!(),
};
this.copy_op(&src, &dest)?;
}
}
}
// Used to implement the _mm_maskload_ps, _mm_maskload_pd, _mm256_maskload_ps
// and _mm256_maskload_pd functions.
// For the element `i`, if the high bit of the `i`-th element of `mask`

View file

@ -829,15 +829,16 @@ unsafe fn test_avx() {
#[target_feature(enable = "avx")]
unsafe fn test_mm256_permute2f128_ps() {
let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
let r = _mm256_permute2f128_ps::<0x13>(a, b);
let e = _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.);
let a = _mm256_setr_ps(11., 12., 13., 14., 15., 16., 17., 18.);
let b = _mm256_setr_ps(21., 22., 23., 24., 25., 26., 27., 28.);
let r = _mm256_permute2f128_ps::<0b0001_0011>(a, b);
let e = _mm256_setr_ps(25., 26., 27., 28., 15., 16., 17., 18.);
assert_eq_m256(r, e);
let r = _mm256_permute2f128_ps::<0x44>(a, b);
let e = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
assert_eq_m256(r, e);
// Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
let r = _mm256_permute2f128_ps::<0b1001_1011>(a, b);
let z = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
assert_eq_m256(r, z);
}
test_mm256_permute2f128_ps();
@ -845,11 +846,12 @@ unsafe fn test_avx() {
unsafe fn test_mm256_permute2f128_pd() {
let a = _mm256_setr_pd(1., 2., 3., 4.);
let b = _mm256_setr_pd(5., 6., 7., 8.);
let r = _mm256_permute2f128_pd::<0x31>(a, b);
let r = _mm256_permute2f128_pd::<0b0011_0001>(a, b);
let e = _mm256_setr_pd(3., 4., 7., 8.);
assert_eq_m256d(r, e);
let r = _mm256_permute2f128_pd::<0x44>(a, b);
// Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
let r = _mm256_permute2f128_pd::<0b1011_1001>(a, b);
let e = _mm256_setr_pd(0.0, 0.0, 0.0, 0.0);
assert_eq_m256d(r, e);
}
@ -857,13 +859,14 @@ unsafe fn test_avx() {
#[target_feature(enable = "avx")]
unsafe fn test_mm256_permute2f128_si256() {
let a = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4);
let b = _mm256_setr_epi32(5, 6, 7, 8, 5, 6, 7, 8);
let r = _mm256_permute2f128_si256::<0x20>(a, b);
let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
let a = _mm256_setr_epi32(11, 12, 13, 14, 15, 16, 17, 18);
let b = _mm256_setr_epi32(21, 22, 23, 24, 25, 26, 27, 28);
let r = _mm256_permute2f128_si256::<0b0010_0000>(a, b);
let e = _mm256_setr_epi32(11, 12, 13, 14, 21, 22, 23, 24);
assert_eq_m256i(r, e);
let r = _mm256_permute2f128_si256::<0x44>(a, b);
// Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
let r = _mm256_permute2f128_si256::<0b1010_1000>(a, b);
let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0);
assert_eq_m256i(r, e);
}

View file

@ -9,10 +9,5 @@ use std::arch::x86_64::*;
fn main() {
assert!(!is_x86_feature_detected!("sse2"));
unsafe {
// This is a SSE2 intrinsic, but it behaves as a no-op when SSE2
// is not available, so it is always safe to call.
_mm_pause();
}
_mm_pause();
}

View file

@ -54,7 +54,7 @@ unsafe fn test_sse2() {
}
fn test_mm_pause() {
unsafe { _mm_pause() }
_mm_pause()
}
test_mm_pause();

View file

@ -7,17 +7,13 @@ mod x86 {
fn adc(c_in: u8, a: u32, b: u32) -> (u8, u32) {
let mut sum = 0;
// SAFETY: There are no safety requirements for calling `_addcarry_u32`.
// It's just unsafe for API consistency with other intrinsics.
let c_out = unsafe { arch::_addcarry_u32(c_in, a, b, &mut sum) };
let c_out = arch::_addcarry_u32(c_in, a, b, &mut sum);
(c_out, sum)
}
fn sbb(b_in: u8, a: u32, b: u32) -> (u8, u32) {
let mut sum = 0;
// SAFETY: There are no safety requirements for calling `_subborrow_u32`.
// It's just unsafe for API consistency with other intrinsics.
let b_out = unsafe { arch::_subborrow_u32(b_in, a, b, &mut sum) };
let b_out = arch::_subborrow_u32(b_in, a, b, &mut sum);
(b_out, sum)
}
@ -52,17 +48,13 @@ mod x86_64 {
fn adc(c_in: u8, a: u64, b: u64) -> (u8, u64) {
let mut sum = 0;
// SAFETY: There are no safety requirements for calling `_addcarry_u64`.
// It's just unsafe for API consistency with other intrinsics.
let c_out = unsafe { arch::_addcarry_u64(c_in, a, b, &mut sum) };
let c_out = arch::_addcarry_u64(c_in, a, b, &mut sum);
(c_out, sum)
}
fn sbb(b_in: u8, a: u64, b: u64) -> (u8, u64) {
let mut sum = 0;
// SAFETY: There are no safety requirements for calling `_subborrow_u64`.
// It's just unsafe for API consistency with other intrinsics.
let b_out = unsafe { arch::_subborrow_u64(b_in, a, b, &mut sum) };
let b_out = arch::_subborrow_u64(b_in, a, b, &mut sum);
(b_out, sum)
}