Auto merge of #148425 - matthiaskrgr:rollup-pwibmxt, r=matthiaskrgr

Rollup of 3 pull requests

Successful merges:

 - rust-lang/rust#146260 (add SliceIndex wrapper types Last and Clamp<Idx>)
 - rust-lang/rust#148394 (Make explicit that `TypeId`'s layout and size are unstable)
 - rust-lang/rust#148402 (stdarch subtree update)

r? `@ghost`
`@rustbot` modify labels: rollup
bors 2025-11-03 06:36:43 +00:00
commit f2bae990e8
82 changed files with 13750 additions and 1086 deletions


@ -611,6 +611,15 @@ impl dyn Any + Send + Sync {
/// noting that the hashes and ordering will vary between Rust releases. Beware
/// of relying on them inside of your code!
///
/// # Layout
///
/// Like other [`Rust`-representation][repr-rust] types, `TypeId`'s size and layout are unstable.
/// In particular, you cannot rely on them staying the same from one Rust release to the next;
/// they may change without prior notice.
///
/// [repr-rust]: https://doc.rust-lang.org/reference/type-layout.html#r-layout.repr.rust.unspecified
///
/// # Danger of Improper Variance
///
/// You might think that subtyping is impossible between two static types,

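The layout note above boils down to: measure, don't assume. A minimal sketch (the printed value is whatever the current toolchain happens to use, nothing more):

```rust
use std::any::TypeId;
use std::mem::size_of;

fn main() {
    // Query the size at runtime instead of hardcoding it; nothing
    // guarantees this value stays the same across Rust releases.
    println!("size_of::<TypeId>() = {}", size_of::<TypeId>());
    // Comparing `TypeId`s remains fine; relying on their layout is not.
    assert_eq!(TypeId::of::<u64>(), TypeId::of::<u64>());
}
```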

@ -271,11 +271,11 @@ pub fn spin_loop() {
crate::cfg_select! {
target_arch = "x86" => {
// SAFETY: the `cfg` attr ensures that we only execute this on x86 targets.
unsafe { crate::arch::x86::_mm_pause() }
crate::arch::x86::_mm_pause()
}
target_arch = "x86_64" => {
// SAFETY: the `cfg` attr ensures that we only execute this on x86_64 targets.
unsafe { crate::arch::x86_64::_mm_pause() }
crate::arch::x86_64::_mm_pause()
}
target_arch = "riscv32" => crate::arch::riscv32::pause(),
target_arch = "riscv64" => crate::arch::riscv64::pause(),
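For callers nothing changes; the x86 pause intrinsic is simply safe to call now. A typical spin-wait loop over the hint (a usage sketch, not part of this diff):

```rust
use std::hint::spin_loop;
use std::sync::atomic::{AtomicBool, Ordering};

fn wait_for(flag: &AtomicBool) {
    // On x86/x86_64 this lowers to PAUSE; `_mm_pause` no longer needs
    // an `unsafe` block inside `spin_loop`.
    while !flag.load(Ordering::Acquire) {
        spin_loop();
    }
}
```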

library/core/src/index.rs (new file, 472 lines)

@ -0,0 +1,472 @@
#![unstable(feature = "sliceindex_wrappers", issue = "146179")]
//! Helper types for indexing slices.
use crate::intrinsics::slice_get_unchecked;
use crate::slice::SliceIndex;
use crate::{cmp, ops, range};
/// Clamps an index, guaranteeing that it will only access valid elements of the slice.
///
/// # Examples
///
/// ```
/// #![feature(sliceindex_wrappers)]
///
/// use core::index::Clamp;
///
/// let s: &[usize] = &[0, 1, 2, 3];
///
/// assert_eq!(&3, &s[Clamp(6)]);
/// assert_eq!(&[1, 2, 3], &s[Clamp(1..6)]);
/// assert_eq!(&[] as &[usize], &s[Clamp(5..6)]);
/// assert_eq!(&[0, 1, 2, 3], &s[Clamp(..6)]);
/// assert_eq!(&[0, 1, 2, 3], &s[Clamp(..=6)]);
/// assert_eq!(&[] as &[usize], &s[Clamp(6..)]);
/// ```
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
#[derive(Debug)]
pub struct Clamp<Idx>(pub Idx);
/// Always accesses the last element of the slice.
///
/// # Examples
///
/// ```
/// #![feature(sliceindex_wrappers)]
/// #![feature(slice_index_methods)]
///
/// use core::index::Last;
/// use core::slice::SliceIndex;
///
/// let s = &[0, 1, 2, 3];
///
/// assert_eq!(&3, &s[Last]);
/// assert_eq!(None, Last.get(&[] as &[usize]));
///
/// ```
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
#[derive(Debug)]
pub struct Last;
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<usize> {
type Output = T;
fn get(self, slice: &[T]) -> Option<&Self::Output> {
slice.get(cmp::min(self.0, slice.len() - 1))
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
slice.get_mut(cmp::min(self.0, slice.len() - 1))
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { slice_get_unchecked(slice, cmp::min(self.0, slice.len() - 1)) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { slice_get_unchecked(slice, cmp::min(self.0, slice.len() - 1)) }
}
fn index(self, slice: &[T]) -> &Self::Output {
&(*slice)[cmp::min(self.0, slice.len() - 1)]
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
&mut (*slice)[cmp::min(self.0, slice.len() - 1)]
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<range::Range<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
// SAFETY: a range ending before len is always valid
unsafe { (start..end).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
// SAFETY: a range ending before len is always valid
unsafe { (start..end).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<ops::Range<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
// SAFETY: a range ending before len is always valid
unsafe { (start..end).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
// SAFETY: a range ending before len is always valid
unsafe { (start..end).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
let start = cmp::min(self.0.start, slice.len());
let end = cmp::min(self.0.end, slice.len());
(start..end).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<range::RangeInclusive<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.last, slice.len() - 1);
(start..=end).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.last, slice.len() - 1);
(start..=end).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.last, slice.len() - 1);
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (start..=end).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.last, slice.len() - 1);
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (start..=end).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.last, slice.len() - 1);
(start..=end).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.last, slice.len() - 1);
(start..=end).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<ops::RangeInclusive<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.end, slice.len() - 1);
(start..=end).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.end, slice.len() - 1);
(start..=end).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.end, slice.len() - 1);
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (start..=end).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.end, slice.len() - 1);
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (start..=end).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.end, slice.len() - 1);
(start..=end).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
let start = cmp::min(self.0.start, slice.len() - 1);
let end = cmp::min(self.0.end, slice.len() - 1);
(start..=end).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<range::RangeFrom<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
(cmp::min(self.0.start, slice.len())..).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
(cmp::min(self.0.start, slice.len())..).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: a range starting at len is valid
unsafe { (cmp::min(self.0.start, slice.len())..).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: a range starting at len is valid
unsafe { (cmp::min(self.0.start, slice.len())..).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
(cmp::min(self.0.start, slice.len())..).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
(cmp::min(self.0.start, slice.len())..).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<ops::RangeFrom<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
(cmp::min(self.0.start, slice.len())..).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
(cmp::min(self.0.start, slice.len())..).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: a range starting at len is valid
unsafe { (cmp::min(self.0.start, slice.len())..).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: a range starting at len is valid
unsafe { (cmp::min(self.0.start, slice.len())..).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
(cmp::min(self.0.start, slice.len())..).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
(cmp::min(self.0.start, slice.len())..).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<range::RangeTo<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
(..cmp::min(self.0.end, slice.len())).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
(..cmp::min(self.0.end, slice.len())).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: a range ending before len is always valid
unsafe { (..cmp::min(self.0.end, slice.len())).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: a range ending before len is always valid
unsafe { (..cmp::min(self.0.end, slice.len())).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
(..cmp::min(self.0.end, slice.len())).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
(..cmp::min(self.0.end, slice.len())).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<range::RangeToInclusive<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
(..=cmp::min(self.0.last, slice.len() - 1)).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
(..=cmp::min(self.0.last, slice.len() - 1)).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (..=cmp::min(self.0.last, slice.len() - 1)).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (..=cmp::min(self.0.last, slice.len() - 1)).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
(..=cmp::min(self.0.last, slice.len() - 1)).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
(..=cmp::min(self.0.last, slice.len() - 1)).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<ops::RangeToInclusive<usize>> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
(..=cmp::min(self.0.end, slice.len() - 1)).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
(..=cmp::min(self.0.end, slice.len() - 1)).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (..=cmp::min(self.0.end, slice.len() - 1)).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { (..=cmp::min(self.0.end, slice.len() - 1)).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
(..=cmp::min(self.0.end, slice.len() - 1)).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
(..=cmp::min(self.0.end, slice.len() - 1)).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Clamp<range::RangeFull> {
type Output = [T];
fn get(self, slice: &[T]) -> Option<&Self::Output> {
(..).get(slice)
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
(..).get_mut(slice)
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: RangeFull just returns `slice` here
unsafe { (..).get_unchecked(slice) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: RangeFull just returns `slice` here
unsafe { (..).get_unchecked_mut(slice) }
}
fn index(self, slice: &[T]) -> &Self::Output {
(..).index(slice)
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
(..).index_mut(slice)
}
}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
unsafe impl<T> SliceIndex<[T]> for Last {
type Output = T;
fn get(self, slice: &[T]) -> Option<&Self::Output> {
slice.last()
}
fn get_mut(self, slice: &mut [T]) -> Option<&mut Self::Output> {
slice.last_mut()
}
unsafe fn get_unchecked(self, slice: *const [T]) -> *const Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { slice_get_unchecked(slice, slice.len() - 1) }
}
unsafe fn get_unchecked_mut(self, slice: *mut [T]) -> *mut Self::Output {
// SAFETY: the caller ensures that the slice isn't empty
unsafe { slice_get_unchecked(slice, slice.len() - 1) }
}
fn index(self, slice: &[T]) -> &Self::Output {
// N.B., use intrinsic indexing
&(*slice)[slice.len() - 1]
}
fn index_mut(self, slice: &mut [T]) -> &mut Self::Output {
// N.B., use intrinsic indexing
&mut (*slice)[slice.len() - 1]
}
}


@ -294,6 +294,7 @@ pub mod cmp;
pub mod convert;
pub mod default;
pub mod error;
pub mod index;
pub mod marker;
pub mod ops;


@ -650,6 +650,18 @@ impl<Idx: PartialOrd<Idx>> RangeToInclusive<Idx> {
}
}
impl<T> From<legacy::RangeToInclusive<T>> for RangeToInclusive<T> {
fn from(value: legacy::RangeToInclusive<T>) -> Self {
Self { last: value.end }
}
}
impl<T> From<RangeToInclusive<T>> for legacy::RangeToInclusive<T> {
fn from(value: RangeToInclusive<T>) -> Self {
Self { end: value.last }
}
}
// RangeToInclusive<Idx> cannot impl From<RangeTo<Idx>>
// because underflow would be possible with (..0).into()
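A quick sketch of the two conversions (assuming the unstable `new_range_api` feature and, as the impls above indicate, a public `last` field):

```rust
#![feature(new_range_api)]
use core::range;

fn main() {
    // `end` on the legacy type maps to `last` on the new one, and back.
    let new: range::RangeToInclusive<u32> = (..=5).into();
    assert_eq!(new.last, 5);
    let legacy: core::ops::RangeToInclusive<u32> = new.into();
    assert_eq!(legacy.end, 5);
}
```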


@ -135,6 +135,11 @@ mod private_slice_index {
impl Sealed for range::RangeFrom<usize> {}
impl Sealed for ops::IndexRange {}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
impl Sealed for crate::index::Last {}
#[unstable(feature = "sliceindex_wrappers", issue = "146179")]
impl<T> Sealed for crate::index::Clamp<T> where T: Sealed {}
}
/// A helper trait used for indexing operations.
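The `Sealed` supertrait above is the usual sealed-trait pattern; in miniature (a generic sketch, not the actual std code):

```rust
mod private {
    // Not nameable outside this module, so no external impls are possible.
    pub trait Sealed {}
    impl Sealed for usize {}
}

// Downstream code can use `MyIndex` in bounds but cannot implement it.
pub trait MyIndex: private::Sealed {}
impl MyIndex for usize {}
```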


@ -0,0 +1,83 @@
use core::index::Clamp;
use core::range;
use core::slice::SliceIndex;
macro_rules! test_clamp {
($range:expr, $(($slice:expr, $other:expr)),+) => {
$(
assert_eq!(Clamp($range.clone()).get(&$slice as &[_]), $other.get(&$slice as &[_]));
assert_eq!(Clamp($range.clone()).get_mut(&mut $slice as &mut [_]), $other.get_mut(&mut $slice as &mut [_]));
unsafe {
assert_eq!(&*Clamp($range.clone()).get_unchecked(&$slice as &[_]), &*$other.get_unchecked(&$slice as &[_]));
assert_eq!(&*Clamp($range.clone()).get_unchecked_mut(&mut $slice as &mut [_]), &*$other.get_unchecked_mut(&mut $slice as &mut [_]));
}
assert_eq!(Clamp($range.clone()).index(&$slice as &[_]), $other.index(&$slice as &[_]));
assert_eq!(Clamp($range.clone()).index_mut(&mut $slice as &mut [_]), $other.index_mut(&mut $slice as &mut [_]));
)+
};
}
#[test]
fn test_clamp_usize() {
test_clamp!(2, ([0, 1], 1), ([0, 1, 2], 2));
}
#[test]
fn test_clamp_range_range() {
test_clamp!(range::Range::from(1..4), ([0, 1], 1..2), ([0, 1, 2, 3, 4], 1..4), ([0], 1..1));
}
#[test]
fn test_clamp_ops_range() {
test_clamp!(1..4, ([0, 1], 1..2), ([0, 1, 2, 3, 4], 1..4), ([0], 1..1));
}
#[test]
fn test_clamp_range_range_inclusive() {
test_clamp!(
range::RangeInclusive::from(1..=3),
([0, 1], 1..=1),
([0, 1, 2, 3, 4], 1..=3),
([0], 0..=0)
);
}
#[test]
fn test_clamp_ops_range_inclusive() {
test_clamp!(1..=3, ([0, 1], 1..=1), ([0, 1, 2, 3, 4], 1..=3), ([0], 0..=0));
}
#[test]
fn test_clamp_range_range_from() {
test_clamp!(range::RangeFrom::from(1..), ([0, 1], 1..), ([0, 1, 2, 3, 4], 1..), ([0], 1..));
}
#[test]
fn test_clamp_ops_range_from() {
test_clamp!(1.., ([0, 1], 1..), ([0, 1, 2, 3, 4], 1..), ([0], 1..));
}
#[test]
fn test_clamp_range_to() {
test_clamp!(..4, ([0, 1], ..2), ([0, 1, 2, 3, 4], ..4), ([0], ..1));
}
#[test]
fn test_clamp_range_range_to_inclusive() {
test_clamp!(
range::RangeToInclusive::from(..=4),
([0, 1], ..=1),
([0, 1, 2, 3, 4], ..=4),
([0], ..=0)
);
}
#[test]
fn test_clamp_ops_range_to_inclusive() {
test_clamp!(..=4, ([0, 1], ..=1), ([0, 1, 2, 3, 4], ..=4), ([0], ..=0));
}
#[test]
fn test_clamp_range_full() {
test_clamp!(.., ([0, 1], ..), ([0, 1, 2, 3, 4], ..), ([0], ..));
}


@ -85,6 +85,7 @@
#![feature(maybe_uninit_write_slice)]
#![feature(min_specialization)]
#![feature(never_type)]
#![feature(new_range_api)]
#![feature(next_index)]
#![feature(non_exhaustive_omitted_patterns_lint)]
#![feature(numfmt)]
@ -97,9 +98,11 @@
#![feature(ptr_metadata)]
#![feature(result_option_map_or_default)]
#![feature(slice_from_ptr_range)]
#![feature(slice_index_methods)]
#![feature(slice_internals)]
#![feature(slice_partition_dedup)]
#![feature(slice_split_once)]
#![feature(sliceindex_wrappers)]
#![feature(split_array)]
#![feature(split_as_slice)]
#![feature(std_internals)]
@ -178,6 +181,7 @@ mod fmt;
mod future;
mod hash;
mod hint;
mod index;
mod intrinsics;
mod io;
mod iter;


@ -249,6 +249,43 @@ jobs:
env:
TARGET: ${{ matrix.target.tuple }}
intrinsic-test:
needs: [style]
name: Intrinsic Test
runs-on: ubuntu-latest
strategy:
matrix:
target:
- aarch64-unknown-linux-gnu
- aarch64_be-unknown-linux-gnu
- armv7-unknown-linux-gnueabihf
- arm-unknown-linux-gnueabihf
- x86_64-unknown-linux-gnu
profile: [dev, release]
include:
- target: aarch64_be-unknown-linux-gnu
build_std: true
steps:
- uses: actions/checkout@v4
- name: Install Rust
run: |
rustup update nightly --no-self-update
rustup default nightly
- run: rustup target add ${{ matrix.target }}
if: ${{ (matrix.build_std || false) == false }}
- run: |
rustup component add rust-src
echo "CARGO_UNSTABLE_BUILD_STD=std" >> $GITHUB_ENV
if: ${{ matrix.build_std }}
# Configure some env vars based on matrix configuration
- run: echo "PROFILE=--profile=${{ matrix.profile }}" >> $GITHUB_ENV
- run: ./ci/intrinsic-test-docker.sh ${{ matrix.target }}
if: ${{ !startsWith(matrix.target, 'thumb') }}
env:
TARGET: ${{ matrix.target }}
# Check that the generated files agree with the checked-in versions.
check-stdarch-gen:
needs: [style]
@ -276,6 +313,7 @@ jobs:
- docs
- verify
- test
- intrinsic-test
- check-stdarch-gen
runs-on: ubuntu-latest
# We need to ensure this job does *not* get skipped if its dependencies fail,


@ -347,8 +347,11 @@ dependencies = [
"itertools",
"log",
"pretty_env_logger",
"quick-xml 0.37.5",
"rayon",
"regex",
"serde",
"serde-xml-rs",
"serde_json",
]
@ -404,9 +407,9 @@ checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432"
[[package]]
name = "memchr"
version = "2.7.5"
version = "2.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
[[package]]
name = "once_cell_polyfill"
@ -452,6 +455,16 @@ dependencies = [
"serde",
]
[[package]]
name = "quick-xml"
version = "0.37.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb"
dependencies = [
"memchr",
"serde",
]
[[package]]
name = "quickcheck"
version = "1.0.3"
@ -587,6 +600,18 @@ dependencies = [
"serde_derive",
]
[[package]]
name = "serde-xml-rs"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53630160a98edebde0123eb4dfd0fce6adff091b2305db3154a9e920206eb510"
dependencies = [
"log",
"serde",
"thiserror",
"xml-rs",
]
[[package]]
name = "serde_derive"
version = "1.0.219"
@ -698,7 +723,7 @@ name = "stdarch-verify"
version = "0.1.0"
dependencies = [
"proc-macro2",
"quick-xml",
"quick-xml 0.33.0",
"quote",
"serde",
"serde_json",
@ -746,6 +771,26 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "thiserror"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "unicode-ident"
version = "1.0.18"
@ -958,6 +1003,12 @@ version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486"
[[package]]
name = "xml-rs"
version = "0.8.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fd8403733700263c6eb89f192880191f1b83e332f7a20371ddcf421c4a337c7"
[[package]]
name = "yaml-rust"
version = "0.4.5"


@ -7,9 +7,9 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends \
xz-utils \
clang
ENV VERSION=v34.0.1
ENV VERSION=v38.0.3
RUN curl -L https://github.com/bytecodealliance/wasmtime/releases/download/${VERSION}/wasmtime-${VERSION}-x86_64-linux.tar.xz | tar xJf -
ENV PATH=$PATH:/wasmtime-${VERSION}-x86_64-linux
ENV CARGO_TARGET_WASM32_WASIP1_RUNNER="wasmtime --dir /checkout/target/wasm32-wasip1/release/deps::."
ENV CARGO_TARGET_WASM32_WASIP1_RUNNER="wasmtime -Wexceptions --dir /checkout/target/wasm32-wasip1/release/deps::."


@ -6,7 +6,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
make \
ca-certificates \
wget \
xz-utils
xz-utils \
clang \
libstdc++-14-dev \
build-essential \
lld
RUN wget http://ci-mirrors.rust-lang.org/stdarch/sde-external-9.58.0-2025-06-16-lin.tar.xz -O sde.tar.xz
RUN mkdir intel-sde
@ -14,5 +18,6 @@ RUN tar -xJf sde.tar.xz --strip-components=1 -C intel-sde
ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/intel-sde/sde64 \
-cpuid-in /checkout/ci/docker/x86_64-unknown-linux-gnu/cpuid.def \
-rtm-mode full -tsx --"
# These tests fail with SDE as it doesn't support saving register data
ENV STDARCH_TEST_SKIP_FUNCTION="xsave,xsaveopt,xsave64,xsaveopt64"
# SDE doesn't support AMD extensions
# FIXME: find a way to test these
ENV STDARCH_TEST_SKIP_FEATURE="sse4a,tbm,xop"


@ -12,7 +12,7 @@
# CPUID_VERSION = 1.0
# Input => Output
# EAX ECX => EAX EBX ECX EDX
00000000 ******** => 00000024 68747541 444d4163 69746e65
00000000 ******** => 00000024 756e6547 6c65746e 49656e69
00000001 ******** => 00400f10 00100800 7ffaf3ff bfebfbff
00000002 ******** => 76035a01 00f0b6ff 00000000 00c10000
00000003 ******** => 00000000 00000000 00000000 00000000
@ -49,7 +49,7 @@
00000024 00000000 => 00000001 00070002 00000000 00000000 #AVX10
00000024 00000001 => 00000000 00000000 00000004 00000000
80000000 ******** => 80000008 00000000 00000000 00000000
80000001 ******** => 00000000 00000000 00200961 2c100000
80000001 ******** => 00000000 00000000 00000121 2c100000
80000002 ******** => 00000000 00000000 00000000 00000000
80000003 ******** => 00000000 00000000 00000000 00000000
80000004 ******** => 00000000 00000000 00000000 00000000
@ -59,5 +59,4 @@
80000008 ******** => 00003028 00000200 00000200 00000000
# This file was copied from intel-sde/misc/cpuid/future/cpuid.def, and modified to
# use "AuthenticAMD" as the vendor and the support for `XOP`, `SSE4a`, `TBM`,
# `AVX512_VP2INTERSECT` and the VEX variants of AVX512 was added in the CPUID.
# add support for `AVX512_VP2INTERSECT`
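The changed hex lines are the CPUID vendor string, which leaf 0 returns in EBX, EDX, ECX order. A small decoding sketch (hypothetical helper, not part of the diff):

```rust
fn vendor(ebx: u32, ecx: u32, edx: u32) -> String {
    // The little-endian bytes of EBX, then EDX, then ECX spell the vendor.
    let mut bytes = Vec::with_capacity(12);
    bytes.extend_from_slice(&ebx.to_le_bytes());
    bytes.extend_from_slice(&edx.to_le_bytes());
    bytes.extend_from_slice(&ecx.to_le_bytes());
    String::from_utf8(bytes).unwrap()
}

fn main() {
    // Old leaf-0 line (EAX EBX ECX EDX output order): "AuthenticAMD".
    assert_eq!(vendor(0x68747541, 0x444d4163, 0x69746e65), "AuthenticAMD");
    // New leaf-0 line: "GenuineIntel".
    assert_eq!(vendor(0x756e6547, 0x6c65746e, 0x49656e69), "GenuineIntel");
}
```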


@ -0,0 +1,57 @@
#!/usr/bin/env sh
# Small script to run the intrinsic tests for a target inside its
# respective docker image.
set -ex
if [ $# -lt 1 ]; then
>&2 echo "Usage: $0 <TARGET>"
exit 1
fi
run() {
# Set the linker that is used for the host (e.g. when compiling a build.rs)
# This overrides any configuration in e.g. `.cargo/config.toml`, which will
# probably not work within the docker container.
HOST_LINKER="CARGO_TARGET_$(rustc --print host-tuple | tr '[:lower:]-' '[:upper:]_')_LINKER"
# Prevent `Read-only file system (os error 30)`.
cargo generate-lockfile
echo "Building docker container for TARGET=${1}"
docker build -t stdarch -f "ci/docker/${1}/Dockerfile" ci/
mkdir -p target c_programs rust_programs
echo "Running docker"
# shellcheck disable=SC2016
docker run \
--rm \
--user "$(id -u)":"$(id -g)" \
--env CARGO_HOME=/cargo \
--env CARGO_TARGET_DIR=/checkout/target \
--env TARGET="${1}" \
--env "${HOST_LINKER}"="cc" \
--env STDARCH_DISABLE_ASSERT_INSTR \
--env NOSTD \
--env NORUN \
--env RUSTFLAGS \
--env CARGO_UNSTABLE_BUILD_STD \
--volume "${HOME}/.cargo":/cargo \
--volume "$(rustc --print sysroot)":/rust:ro \
--volume "$(pwd)":/checkout:ro \
--volume "$(pwd)"/target:/checkout/target \
--volume "$(pwd)"/c_programs:/checkout/c_programs \
--volume "$(pwd)"/rust_programs:/checkout/rust_programs \
--init \
--workdir /checkout \
--privileged \
stdarch \
sh -c "HOME=/tmp PATH=\$PATH:/rust/bin exec ci/intrinsic-test.sh ${1}"
}
if [ -z "$1" ]; then
>&2 echo "No target specified!"
exit 1
else
run "${1}"
fi


@ -0,0 +1,123 @@
#!/usr/bin/env sh
set -ex
: "${TARGET?The TARGET environment variable must be set.}"
export RUSTFLAGS="${RUSTFLAGS} -D warnings -Z merge-functions=disabled -Z verify-llvm-ir"
export HOST_RUSTFLAGS="${RUSTFLAGS}"
export PROFILE="${PROFILE:="--profile=release"}"
case ${TARGET} in
# On 32-bit targets, use the static relocation model, which avoids some
# extra instructions when dealing with static data, notably allowing some
# instruction assertion checks to pass below the 20-instruction limit. With
# the default (dynamic) model, too many instructions are generated when we
# assert the instruction for a function, and tests fail.
i686-* | i586-*)
export RUSTFLAGS="${RUSTFLAGS} -C relocation-model=static"
;;
# Some x86_64 targets enable features beyond SSE2 by default, which causes
# some instruction assertion checks to fail.
x86_64-*)
export RUSTFLAGS="${RUSTFLAGS} -C target-feature=-sse3"
;;
# Unoptimized builds use fast-isel, which breaks with MSA.
mips-* | mipsel-*)
export RUSTFLAGS="${RUSTFLAGS} -C llvm-args=-fast-isel=false"
;;
armv7-*eabihf | thumbv7-*eabihf)
export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+neon"
;;
# Some of our test dependencies use the deprecated `gcc` crate, which
# doesn't detect RISC-V compilers automatically, so do it manually here.
riscv*)
export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+zk,+zks,+zbb,+zbc"
;;
esac
echo "RUSTFLAGS=${RUSTFLAGS}"
echo "OBJDUMP=${OBJDUMP}"
echo "PROFILE=${PROFILE}"
INTRINSIC_TEST="--manifest-path=crates/intrinsic-test/Cargo.toml"
# Test targets compiled with extra features.
case ${TARGET} in
# Setup aarch64 & armv7 specific variables, the runner, along with some
# tests to skip
aarch64-unknown-linux-gnu*)
TEST_CPPFLAGS="-fuse-ld=lld -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/9/aarch64-linux-gnu/"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_aarch64.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER}"
;;
aarch64_be-unknown-linux-gnu*)
TEST_CPPFLAGS="-fuse-ld=lld"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_aarch64.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_RUNNER}"
;;
armv7-unknown-linux-gnueabihf*)
TEST_CPPFLAGS="-fuse-ld=lld -I/usr/arm-linux-gnueabihf/include/ -I/usr/arm-linux-gnueabihf/include/c++/9/arm-linux-gnueabihf/"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_arm.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER}"
;;
x86_64-unknown-linux-gnu*)
TEST_CPPFLAGS="-fuse-ld=lld -I/usr/include/x86_64-linux-gnu/"
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER}"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_x86.txt
TEST_SAMPLE_INTRINSICS_PERCENTAGE=5
;;
*)
;;
esac
# Arm specific
case "${TARGET}" in
aarch64-unknown-linux-gnu*|armv7-unknown-linux-gnueabihf*)
CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \
cargo run "${INTRINSIC_TEST}" "${PROFILE}" \
--bin intrinsic-test -- intrinsics_data/arm_intrinsics.json \
--runner "${TEST_RUNNER}" \
--cppcompiler "${TEST_CXX_COMPILER}" \
--skip "${TEST_SKIP_INTRINSICS}" \
--target "${TARGET}"
;;
aarch64_be-unknown-linux-gnu*)
CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \
cargo run "${INTRINSIC_TEST}" "${PROFILE}" \
--bin intrinsic-test -- intrinsics_data/arm_intrinsics.json \
--runner "${TEST_RUNNER}" \
--cppcompiler "${TEST_CXX_COMPILER}" \
--skip "${TEST_SKIP_INTRINSICS}" \
--target "${TARGET}" \
--linker "${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_LINKER}" \
--cxx-toolchain-dir "${AARCH64_BE_TOOLCHAIN}"
;;
x86_64-unknown-linux-gnu*)
# `CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER` is not necessary for `intrinsic-test`
# because the binary needs to run directly on the host.
# Hence the use of `env -u`.
env -u CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER \
CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" \
RUST_LOG=warn RUST_BACKTRACE=1 \
cargo run "${INTRINSIC_TEST}" "${PROFILE}" \
--bin intrinsic-test -- intrinsics_data/x86-intel.xml \
--runner "${TEST_RUNNER}" \
--skip "${TEST_SKIP_INTRINSICS}" \
--cppcompiler "${TEST_CXX_COMPILER}" \
--target "${TARGET}" \
--sample-percentage "${TEST_SAMPLE_INTRINSICS_PERCENTAGE}"
;;
*)
;;
esac


@ -79,7 +79,6 @@ cargo_test() {
CORE_ARCH="--manifest-path=crates/core_arch/Cargo.toml"
STDARCH_EXAMPLES="--manifest-path=examples/Cargo.toml"
INTRINSIC_TEST="--manifest-path=crates/intrinsic-test/Cargo.toml"
cargo_test "${CORE_ARCH} ${PROFILE}"
@ -130,61 +129,11 @@ case ${TARGET} in
export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+altivec"
cargo_test "${PROFILE}"
;;
# Setup aarch64 & armv7 specific variables, the runner, along with some
# tests to skip
aarch64-unknown-linux-gnu*)
TEST_CPPFLAGS="-fuse-ld=lld -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/9/aarch64-linux-gnu/"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_aarch64.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER}"
;;
aarch64_be-unknown-linux-gnu*)
TEST_CPPFLAGS="-fuse-ld=lld"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_aarch64.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_RUNNER}"
;;
armv7-unknown-linux-gnueabihf*)
TEST_CPPFLAGS="-fuse-ld=lld -I/usr/arm-linux-gnueabihf/include/ -I/usr/arm-linux-gnueabihf/include/c++/9/arm-linux-gnueabihf/"
TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_arm.txt
TEST_CXX_COMPILER="clang++"
TEST_RUNNER="${CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER}"
;;
*)
;;
esac
# Arm specific
case "${TARGET}" in
aarch64-unknown-linux-gnu*|armv7-unknown-linux-gnueabihf*)
CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \
cargo run "${INTRINSIC_TEST}" "${PROFILE}" \
--bin intrinsic-test -- intrinsics_data/arm_intrinsics.json \
--runner "${TEST_RUNNER}" \
--cppcompiler "${TEST_CXX_COMPILER}" \
--skip "${TEST_SKIP_INTRINSICS}" \
--target "${TARGET}"
;;
aarch64_be-unknown-linux-gnu*)
CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \
cargo run "${INTRINSIC_TEST}" "${PROFILE}" \
--bin intrinsic-test -- intrinsics_data/arm_intrinsics.json \
--runner "${TEST_RUNNER}" \
--cppcompiler "${TEST_CXX_COMPILER}" \
--skip "${TEST_SKIP_INTRINSICS}" \
--target "${TARGET}" \
--linker "${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_LINKER}" \
--cxx-toolchain-dir "${AARCH64_BE_TOOLCHAIN}"
;;
*)
;;
esac
if [ "$NORUN" != "1" ] && [ "$NOSTD" != 1 ]; then
# Test examples
(


@ -44,9 +44,14 @@ use crate::arch::asm;
#[inline]
#[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
pub fn pause() {
// Use `.option` directives to expose this HINT instruction
// (no-op if not supported by the hardware) without `#[target_feature]`.
unsafe {
asm!(
".insn i 0x0F, 0, x0, x0, 0x010",
".option push",
".option arch, +zihintpause",
"pause",
".option pop",
options(nomem, nostack, preserves_flags)
);
}


@ -60,26 +60,6 @@ struct PackedTuple<T, U> {
#[allow(improper_ctypes)]
#[rustfmt::skip]
unsafe extern "unadjusted" {
#[link_name = "llvm.smax.v16i8"] fn vmxb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char;
#[link_name = "llvm.smax.v8i16"] fn vmxh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short;
#[link_name = "llvm.smax.v4i32"] fn vmxf(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
#[link_name = "llvm.smax.v2i64"] fn vmxg(a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_long_long;
#[link_name = "llvm.umax.v16i8"] fn vmxlb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char;
#[link_name = "llvm.umax.v8i16"] fn vmxlh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short;
#[link_name = "llvm.umax.v4i32"] fn vmxlf(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int;
#[link_name = "llvm.umax.v2i64"] fn vmxlg(a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long;
#[link_name = "llvm.smin.v16i8"] fn vmnb(a: vector_signed_char, b: vector_signed_char) -> vector_signed_char;
#[link_name = "llvm.smin.v8i16"] fn vmnh(a: vector_signed_short, b: vector_signed_short) -> vector_signed_short;
#[link_name = "llvm.smin.v4i32"] fn vmnf(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
#[link_name = "llvm.smin.v2i64"] fn vmng(a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_long_long;
#[link_name = "llvm.umin.v16i8"] fn vmnlb(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char;
#[link_name = "llvm.umin.v8i16"] fn vmnlh(a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short;
#[link_name = "llvm.umin.v4i32"] fn vmnlf(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int;
#[link_name = "llvm.umin.v2i64"] fn vmnlg(a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long;
#[link_name = "llvm.nearbyint.v4f32"] fn nearbyint_v4f32(a: vector_float) -> vector_float;
#[link_name = "llvm.nearbyint.v2f64"] fn nearbyint_v2f64(a: vector_double) -> vector_double;
@ -683,17 +663,40 @@ mod sealed {
unsafe fn vec_max(self, b: Other) -> Self::Result;
}
test_impl! { vec_vmxsb (a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [vmxb, vmxb] }
test_impl! { vec_vmxsh (a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [vmxh, vmxh] }
test_impl! { vec_vmxsf (a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [vmxf, vmxf] }
test_impl! { vec_vmxsg (a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_long_long [vmxg, vmxg] }
test_impl! { vec_vmxslb (a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [vmxlb, vmxlb] }
test_impl! { vec_vmxslh (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [vmxlh, vmxlh] }
test_impl! { vec_vmxslf (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [vmxlf, vmxlf] }
test_impl! { vec_vmxslg (a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long [vmxlg, vmxlg] }
impl_vec_trait! { [VectorMax vec_max] ~(vmxlb, vmxb, vmxlh, vmxh, vmxlf, vmxf, vmxlg, vmxg) }
macro_rules! impl_max {
    ($name:ident, $a:ty, $instr:ident) => {
        #[inline]
        #[target_feature(enable = "vector")]
        #[cfg_attr(test, assert_instr($instr))]
        pub unsafe fn $name(a: $a, b: $a) -> $a {
            simd_select(simd_ge::<_, $a>(a, b), a, b)
        }

        #[unstable(feature = "stdarch_s390x", issue = "135681")]
        impl VectorMax<Self> for $a {
            type Result = Self;

            #[inline]
            #[target_feature(enable = "vector")]
            unsafe fn vec_max(self, other: Self) -> Self {
                $name(self, other)
            }
        }
    };
}
mod impl_max {
    use super::*;

    impl_max!(vec_vmxsc, vector_signed_char, vmxb);
    impl_max!(vec_vmxslc, vector_unsigned_char, vmxlb);
    impl_max!(vec_vmxsh, vector_signed_short, vmxh);
    impl_max!(vec_vmxslh, vector_unsigned_short, vmxlh);
    impl_max!(vec_vmxsf, vector_signed_int, vmxf);
    impl_max!(vec_vmxslf, vector_unsigned_int, vmxlf);
    impl_max!(vec_vmxsg, vector_signed_long_long, vmxg);
    impl_max!(vec_vmxslg, vector_unsigned_long_long, vmxlg);
}
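The macro swaps per-width LLVM intrinsics for a generic compare-and-select. The same idea, expressed with portable SIMD purely for illustration (a sketch assuming nightly `portable_simd`, not s390x code):

```rust
#![feature(portable_simd)]
use std::simd::cmp::SimdOrd;
use std::simd::i32x4;

fn main() {
    // Lanewise max via compare-and-select, as `impl_max!` does with
    // `simd_ge` + `simd_select`.
    let a = i32x4::from_array([1, -5, 3, 7]);
    let b = i32x4::from_array([2, -6, 3, 0]);
    assert_eq!(a.simd_max(b).to_array(), [2, -5, 3, 7]);
}
```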
test_impl! { vec_vfmaxsb (a: vector_float, b: vector_float) -> vector_float [simd_fmax, "vector-enhancements-1" vfmaxsb ] }
test_impl! { vec_vfmaxdb (a: vector_double, b: vector_double) -> vector_double [simd_fmax, "vector-enhancements-1" vfmaxdb] }
@ -707,17 +710,40 @@ mod sealed {
unsafe fn vec_min(self, b: Other) -> Self::Result;
}
test_impl! { vec_vmnsb (a: vector_signed_char, b: vector_signed_char) -> vector_signed_char [vmnb, vmnb] }
test_impl! { vec_vmnsh (a: vector_signed_short, b: vector_signed_short) -> vector_signed_short [vmnh, vmnh] }
test_impl! { vec_vmnsf (a: vector_signed_int, b: vector_signed_int) -> vector_signed_int [vmnf, vmnf] }
test_impl! { vec_vmnsg (a: vector_signed_long_long, b: vector_signed_long_long) -> vector_signed_long_long [vmng, vmng] }
test_impl! { vec_vmnslb (a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char [vmnlb, vmnlb] }
test_impl! { vec_vmnslh (a: vector_unsigned_short, b: vector_unsigned_short) -> vector_unsigned_short [vmnlh, vmnlh] }
test_impl! { vec_vmnslf (a: vector_unsigned_int, b: vector_unsigned_int) -> vector_unsigned_int [vmnlf, vmnlf] }
test_impl! { vec_vmnslg (a: vector_unsigned_long_long, b: vector_unsigned_long_long) -> vector_unsigned_long_long [vmnlg, vmnlg] }
impl_vec_trait! { [VectorMin vec_min] ~(vmxlb, vmxb, vmxlh, vmxh, vmxlf, vmxf, vmxlg, vmxg) }
macro_rules! impl_min {
    ($name:ident, $a:ty, $instr:ident) => {
        #[inline]
        #[target_feature(enable = "vector")]
        #[cfg_attr(test, assert_instr($instr))]
        pub unsafe fn $name(a: $a, b: $a) -> $a {
            simd_select(simd_le::<_, $a>(a, b), a, b)
        }

        #[unstable(feature = "stdarch_s390x", issue = "135681")]
        impl VectorMin<Self> for $a {
            type Result = Self;

            #[inline]
            #[target_feature(enable = "vector")]
            unsafe fn vec_min(self, other: Self) -> Self {
                $name(self, other)
            }
        }
    };
}
mod impl_min {
    use super::*;

    impl_min!(vec_vmnsc, vector_signed_char, vmnb);
    impl_min!(vec_vmnslc, vector_unsigned_char, vmnlb);
    impl_min!(vec_vmnsh, vector_signed_short, vmnh);
    impl_min!(vec_vmnslh, vector_unsigned_short, vmnlh);
    impl_min!(vec_vmnsf, vector_signed_int, vmnf);
    impl_min!(vec_vmnslf, vector_unsigned_int, vmnlf);
    impl_min!(vec_vmnsg, vector_signed_long_long, vmng);
    impl_min!(vec_vmnslg, vector_unsigned_long_long, vmnlg);
}
test_impl! { vec_vfminsb (a: vector_float, b: vector_float) -> vector_float [simd_fmin, "vector-enhancements-1" vfminsb] }
test_impl! { vec_vfmindb (a: vector_double, b: vector_double) -> vector_double [simd_fmin, "vector-enhancements-1" vfmindb] }
@ -2368,17 +2394,13 @@ mod sealed {
unsafe fn vec_packsu(self, b: Other) -> Self::Result;
}
unsafe fn simd_smax<T: Copy>(a: T, b: T) -> T {
simd_select::<T, T>(simd_gt::<T, T>(a, b), a, b)
}
#[inline]
#[target_feature(enable = "vector")]
#[cfg_attr(test, assert_instr(vpklsh))]
unsafe fn vpacksuh(a: vector_signed_short, b: vector_signed_short) -> vector_unsigned_char {
vpklsh(
simd_smax(a, vector_signed_short([0; 8])),
simd_smax(b, vector_signed_short([0; 8])),
vec_max(a, vector_signed_short([0; 8])),
vec_max(b, vector_signed_short([0; 8])),
)
}
#[inline]
@ -2386,8 +2408,8 @@ mod sealed {
#[cfg_attr(test, assert_instr(vpklsf))]
unsafe fn vpacksuf(a: vector_signed_int, b: vector_signed_int) -> vector_unsigned_short {
vpklsf(
simd_smax(a, vector_signed_int([0; 4])),
simd_smax(b, vector_signed_int([0; 4])),
vec_max(a, vector_signed_int([0; 4])),
vec_max(b, vector_signed_int([0; 4])),
)
}
#[inline]
@ -2398,8 +2420,8 @@ mod sealed {
b: vector_signed_long_long,
) -> vector_unsigned_int {
vpklsg(
simd_smax(a, vector_signed_long_long([0; 2])),
simd_smax(b, vector_signed_long_long([0; 2])),
vec_max(a, vector_signed_long_long([0; 2])),
vec_max(b, vector_signed_long_long([0; 2])),
)
}
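Element-wise, the saturating pack clamps negatives to zero (the `vec_max` call) and then saturates at the narrower width. A scalar model of one `vpacksuh` element (a sketch, not the SIMD code path):

```rust
fn packsu_element(x: i16) -> u8 {
    let clamped = x.max(0) as u16; // vec_max(a, 0)
    clamped.min(u8::MAX as u16) as u8 // vpklsh's unsigned saturation
}

fn main() {
    assert_eq!(packsu_element(-7), 0);
    assert_eq!(packsu_element(300), 255);
    assert_eq!(packsu_element(42), 42);
}
```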


@ -5,8 +5,6 @@ use stdarch_test::assert_instr;
unsafe extern "unadjusted" {
#[link_name = "llvm.x86.addcarry.32"]
fn llvm_addcarry_u32(a: u8, b: u32, c: u32) -> (u8, u32);
#[link_name = "llvm.x86.addcarryx.u32"]
fn llvm_addcarryx_u32(a: u8, b: u32, c: u32, d: *mut u32) -> u8;
#[link_name = "llvm.x86.subborrow.32"]
fn llvm_subborrow_u32(a: u8, b: u32, c: u32) -> (u8, u32);
}
@ -19,8 +17,8 @@ unsafe extern "unadjusted" {
#[inline]
#[cfg_attr(test, assert_instr(adc))]
#[stable(feature = "simd_x86_adx", since = "1.33.0")]
pub unsafe fn _addcarry_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
let (a, b) = llvm_addcarry_u32(c_in, a, b);
pub fn _addcarry_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
let (a, b) = unsafe { llvm_addcarry_u32(c_in, a, b) };
*out = b;
a
}
@ -34,8 +32,8 @@ pub unsafe fn _addcarry_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
#[target_feature(enable = "adx")]
#[cfg_attr(test, assert_instr(adc))]
#[stable(feature = "simd_x86_adx", since = "1.33.0")]
pub unsafe fn _addcarryx_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
llvm_addcarryx_u32(c_in, a, b, out as *mut _)
pub fn _addcarryx_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
_addcarry_u32(c_in, a, b, out)
}
/// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in`
@ -46,8 +44,8 @@ pub unsafe fn _addcarryx_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
#[inline]
#[cfg_attr(test, assert_instr(sbb))]
#[stable(feature = "simd_x86_adx", since = "1.33.0")]
pub unsafe fn _subborrow_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
let (a, b) = llvm_subborrow_u32(c_in, a, b);
pub fn _subborrow_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
let (a, b) = unsafe { llvm_subborrow_u32(c_in, a, b) };
*out = b;
a
}
@ -60,38 +58,36 @@ mod tests {
#[test]
fn test_addcarry_u32() {
unsafe {
let a = u32::MAX;
let mut out = 0;
let a = u32::MAX;
let mut out = 0;
let r = _addcarry_u32(0, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarry_u32(0, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarry_u32(0, a, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, a);
let r = _addcarry_u32(0, a, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, a);
let r = _addcarry_u32(1, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 1);
let r = _addcarry_u32(1, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 1);
let r = _addcarry_u32(1, a, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarry_u32(1, a, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarry_u32(0, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 7);
let r = _addcarry_u32(0, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 7);
let r = _addcarry_u32(1, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 8);
}
let r = _addcarry_u32(1, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 8);
}
#[simd_test(enable = "adx")]
unsafe fn test_addcarryx_u32() {
fn test_addcarryx_u32() {
let a = u32::MAX;
let mut out = 0;
@ -121,44 +117,39 @@ mod tests {
}
#[simd_test(enable = "adx")]
unsafe fn test_addcarryx_u32_2() {
unsafe fn add_1_2_3() -> u32 {
let mut out = 0;
_addcarryx_u32(1, 2, 3, &mut out);
out
}
assert_eq!(6, add_1_2_3());
fn test_addcarryx_u32_2() {
let mut out = 0;
_addcarryx_u32(1, 2, 3, &mut out);
assert_eq!(6, out);
}
#[test]
fn test_subborrow_u32() {
unsafe {
let a = u32::MAX;
let mut out = 0;
let a = u32::MAX;
let mut out = 0;
let r = _subborrow_u32(0, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u32(0, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u32(0, 0, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 0);
let r = _subborrow_u32(0, 0, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 0);
let r = _subborrow_u32(1, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a - 1);
let r = _subborrow_u32(1, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a - 1);
let r = _subborrow_u32(1, 0, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u32(1, 0, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u32(0, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 4);
let r = _subborrow_u32(0, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 4);
let r = _subborrow_u32(1, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 3);
}
let r = _subborrow_u32(1, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 3);
}
}
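With the carry intrinsics now safe functions, chaining limbs looks like this (a usage sketch, not code from this diff; on older toolchains the calls still need `unsafe`):

```rust
#[cfg(target_arch = "x86_64")]
fn add_u64_via_u32(a: u64, b: u64) -> (u64, bool) {
    use std::arch::x86_64::_addcarry_u32;
    let (mut lo, mut hi) = (0u32, 0u32);
    // Feed the carry out of the low limb into the high limb.
    let c = _addcarry_u32(0, a as u32, b as u32, &mut lo);
    let c = _addcarry_u32(c, (a >> 32) as u32, (b >> 32) as u32, &mut hi);
    (u64::from(hi) << 32 | u64::from(lo), c != 0)
}
```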


@ -587,7 +587,11 @@ pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
unsafe { vhaddpd(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
simd_add(even, odd)
}
}
/// Horizontal addition of adjacent pairs in the two packed vectors
@ -602,7 +606,11 @@ pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
unsafe { vhaddps(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
simd_add(even, odd)
}
}
/// Horizontal subtraction of adjacent pairs in the two packed vectors
@ -616,7 +624,11 @@ pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
unsafe { vhsubpd(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
simd_sub(even, odd)
}
}
/// Horizontal subtraction of adjacent pairs in the two packed vectors
@ -631,7 +643,11 @@ pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
unsafe { vhsubps(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
simd_sub(even, odd)
}
}
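The shuffle-based lowering keeps the documented lane order for horizontal add/sub. A hedged sanity check (assumes AVX is available):

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn hadd_pd_demo() {
    use std::arch::x86_64::*;
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
    // Even/odd shuffle + add yields [a0+a1, b0+b1, a2+a3, b2+b3].
    let r = _mm256_hadd_pd(a, b);
    let mut out = [0.0f64; 4];
    _mm256_storeu_pd(out.as_mut_ptr(), r);
    assert_eq!(out, [3.0, 30.0, 7.0, 70.0]);
}
```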
/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
@ -1218,7 +1234,10 @@ pub fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
static_assert_uimm_bits!(IMM8, 8);
unsafe { vperm2f128ps256(a, b, IMM8 as i8) }
_mm256_castsi256_ps(_mm256_permute2f128_si256::<IMM8>(
_mm256_castps_si256(a),
_mm256_castps_si256(b),
))
}
/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
@ -1232,7 +1251,10 @@ pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
static_assert_uimm_bits!(IMM8, 8);
unsafe { vperm2f128pd256(a, b, IMM8 as i8) }
_mm256_castsi256_pd(_mm256_permute2f128_si256::<IMM8>(
_mm256_castpd_si256(a),
_mm256_castpd_si256(b),
))
}
/// Shuffles 128-bits (composed of integer data) selected by `imm8`
@ -1246,7 +1268,35 @@ pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
static_assert_uimm_bits!(IMM8, 8);
unsafe { transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) }
const fn idx(imm8: i32, pos: u32) -> u32 {
let part = if pos < 2 {
imm8 & 0xf
} else {
(imm8 & 0xf0) >> 4
};
2 * (part as u32 & 0b11) + (pos & 1)
}
const fn idx0(imm8: i32, pos: u32) -> u32 {
let part = if pos < 2 {
imm8 & 0xf
} else {
(imm8 & 0xf0) >> 4
};
if part & 0b1000 != 0 { 4 } else { pos }
}
unsafe {
let r = simd_shuffle!(
a.as_i64x4(),
b.as_i64x4(),
[idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)]
);
let r: i64x4 = simd_shuffle!(
r,
i64x4::ZERO,
[idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)]
);
r.as_m256i()
}
}
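The `idx`/`idx0` helpers model VPERM2F128's immediate: each nibble selects a 128-bit half, and bit 3 of a nibble zeroes that half instead. For example (assumes AVX):

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn permute2f128_demo() {
    use std::arch::x86_64::*;
    let a = _mm256_setr_epi64x(0, 1, 2, 3);
    let b = _mm256_setr_epi64x(4, 5, 6, 7);
    // IMM8 = 0x21: low nibble 1 picks the high half of `a`; high nibble 2
    // picks the low half of `b`.
    let r = _mm256_permute2f128_si256::<0x21>(a, b);
    let mut out = [0i64; 4];
    _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
    assert_eq!(out, [2, 3, 4, 5]);
}
```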
/// Broadcasts a single-precision (32-bit) floating-point element from memory
@ -1783,6 +1833,7 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vmovntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("vmovntdq", ",{a}"),
p = in(reg) mem_addr,
@ -1811,6 +1862,7 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("vmovntpd", ",{a}"),
p = in(reg) mem_addr,
@ -1840,6 +1892,7 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("vmovntps", ",{a}"),
p = in(reg) mem_addr,
@ -1933,7 +1986,10 @@ pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
unsafe { ptestz256(a.as_i64x4(), b.as_i64x4()) }
unsafe {
let r = simd_and(a.as_i64x4(), b.as_i64x4());
(0i64 == simd_reduce_or(r)) as i32
}
}
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
@ -1947,7 +2003,10 @@ pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
unsafe { ptestc256(a.as_i64x4(), b.as_i64x4()) }
unsafe {
let r = simd_and(simd_xor(a.as_i64x4(), i64x4::splat(!0)), b.as_i64x4());
(0i64 == simd_reduce_or(r)) as i32
}
}
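These open-coded reductions mirror VPTEST's flag semantics: `testz` reports whether `a & b` is all zeros, `testc` whether `!a & b` is. A quick check of the ZF case (a sketch; assumes AVX):

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn testz_demo() {
    use std::arch::x86_64::*;
    let a = _mm256_setr_epi64x(1, 2, 4, 8);
    let b = _mm256_setr_epi64x(2, 1, 8, 4); // bit-disjoint from `a`
    // 1 iff (a & b) is all zeros, exactly what the simd_reduce_or computes.
    assert_eq!(_mm256_testz_si256(a, b), 1);
    assert_eq!(_mm256_testz_si256(a, a), 0);
}
```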
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
@ -2031,7 +2090,10 @@ pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
unsafe { vtestzpd(a, b) }
unsafe {
let r: i64x2 = simd_lt(transmute(_mm_and_pd(a, b)), i64x2::ZERO);
(0i64 == simd_reduce_or(r)) as i32
}
}
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
@ -2048,7 +2110,10 @@ pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
unsafe { vtestcpd(a, b) }
unsafe {
let r: i64x2 = simd_lt(transmute(_mm_andnot_pd(a, b)), i64x2::ZERO);
(0i64 == simd_reduce_or(r)) as i32
}
}
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
@ -2135,7 +2200,10 @@ pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
unsafe { vtestzps(a, b) }
unsafe {
let r: i32x4 = simd_lt(transmute(_mm_and_ps(a, b)), i32x4::ZERO);
(0i32 == simd_reduce_or(r)) as i32
}
}
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
@ -2152,7 +2220,10 @@ pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
unsafe { vtestcps(a, b) }
unsafe {
let r: i32x4 = simd_lt(transmute(_mm_andnot_ps(a, b)), i32x4::ZERO);
(0i32 == simd_reduce_or(r)) as i32
}
}
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
@ -3044,14 +3115,6 @@ unsafe extern "C" {
fn roundps256(a: __m256, b: i32) -> __m256;
#[link_name = "llvm.x86.avx.dp.ps.256"]
fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
#[link_name = "llvm.x86.avx.hadd.pd.256"]
fn vhaddpd(a: __m256d, b: __m256d) -> __m256d;
#[link_name = "llvm.x86.avx.hadd.ps.256"]
fn vhaddps(a: __m256, b: __m256) -> __m256;
#[link_name = "llvm.x86.avx.hsub.pd.256"]
fn vhsubpd(a: __m256d, b: __m256d) -> __m256d;
#[link_name = "llvm.x86.avx.hsub.ps.256"]
fn vhsubps(a: __m256, b: __m256) -> __m256;
#[link_name = "llvm.x86.sse2.cmp.pd"]
fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
#[link_name = "llvm.x86.avx.cmp.pd.256"]
@ -3084,12 +3147,6 @@ unsafe extern "C" {
fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
#[link_name = "llvm.x86.avx.vpermilvar.pd"]
fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
#[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256;
#[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d;
#[link_name = "llvm.x86.avx.vperm2f128.si.256"]
fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
#[link_name = "llvm.x86.avx.maskload.pd.256"]
fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
#[link_name = "llvm.x86.avx.maskstore.pd.256"]
@ -3112,10 +3169,6 @@ unsafe extern "C" {
fn vrcpps(a: __m256) -> __m256;
#[link_name = "llvm.x86.avx.rsqrt.ps.256"]
fn vrsqrtps(a: __m256) -> __m256;
#[link_name = "llvm.x86.avx.ptestz.256"]
fn ptestz256(a: i64x4, b: i64x4) -> i32;
#[link_name = "llvm.x86.avx.ptestc.256"]
fn ptestc256(a: i64x4, b: i64x4) -> i32;
#[link_name = "llvm.x86.avx.ptestnzc.256"]
fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
#[link_name = "llvm.x86.avx.vtestz.pd.256"]
@ -3124,10 +3177,6 @@ unsafe extern "C" {
fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
#[link_name = "llvm.x86.avx.vtestz.pd"]
fn vtestzpd(a: __m128d, b: __m128d) -> i32;
#[link_name = "llvm.x86.avx.vtestc.pd"]
fn vtestcpd(a: __m128d, b: __m128d) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.pd"]
fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
#[link_name = "llvm.x86.avx.vtestz.ps.256"]
@ -3136,10 +3185,6 @@ unsafe extern "C" {
fn vtestcps256(a: __m256, b: __m256) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
fn vtestnzcps256(a: __m256, b: __m256) -> i32;
#[link_name = "llvm.x86.avx.vtestz.ps"]
fn vtestzps(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.avx.vtestc.ps"]
fn vtestcps(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.ps"]
fn vtestnzcps(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.avx.min.ps.256"]
@ -4249,6 +4294,7 @@ mod tests {
let a = _mm256_setr_epi64x(1, 2, 3, 4);
let mut r = _mm256_undefined_si256();
_mm256_stream_si256(ptr::addr_of_mut!(r), a);
_mm_sfence();
assert_eq_m256i(r, a);
}
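The `_mm_sfence()` calls added to these tests are not incidental: streaming stores are weakly ordered and may bypass the cache, so a fence is needed before the stored data is read back through ordinary loads. A minimal sketch of the pattern (`stream_then_read` is a hypothetical helper; assumes an x86_64 target with AVX available at runtime):

use std::arch::x86_64::{__m256i, _mm256_stream_si256, _mm_sfence};

unsafe fn stream_then_read(dst: *mut __m256i, v: __m256i) -> __m256i {
    _mm256_stream_si256(dst, v); // weakly ordered nontemporal store
    _mm_sfence(); // order the store before the plain load below
    *dst
}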
@ -4263,6 +4309,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 4] };
_mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
_mm_sfence();
for i in 0..4 {
assert_eq!(mem.data[i], get_m256d(a, i));
}
@ -4279,6 +4326,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 8] };
_mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
_mm_sfence();
for i in 0..8 {
assert_eq!(mem.data[i], get_m256(a, i));
}

View file

@ -891,7 +891,21 @@ pub fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
#[cfg_attr(test, assert_instr(vphaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) }
let a = a.as_i16x16();
let b = b.as_i16x16();
unsafe {
let even: i16x16 = simd_shuffle!(
a,
b,
[0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
);
let odd: i16x16 = simd_shuffle!(
a,
b,
[1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
);
simd_add(even, odd).as_m256i()
}
}
/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
@ -902,7 +916,13 @@ pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vphaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(phaddd(a.as_i32x8(), b.as_i32x8())) }
let a = a.as_i32x8();
let b = b.as_i32x8();
unsafe {
let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
simd_add(even, odd).as_m256i()
}
}
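The shuffle indices are less opaque than they look: one gather collects the even-position inputs, the other the odd-position inputs, and adding the two gathers is the horizontal add. A scalar model of one 128-bit lane of `vphaddd` (hypothetical helper):

fn hadd_epi32_lane(a: [i32; 4], b: [i32; 4]) -> [i32; 4] {
    let even = [a[0], a[2], b[0], b[2]];
    let odd = [a[1], a[3], b[1], b[3]];
    let mut r = [0i32; 4];
    for i in 0..4 {
        r[i] = even[i].wrapping_add(odd[i]);
    }
    r
}

fn main() {
    assert_eq!(hadd_epi32_lane([1, 2, 3, 4], [10, 20, 30, 40]), [3, 7, 30, 70]);
}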
/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
@ -925,7 +945,21 @@ pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vphsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(phsubw(a.as_i16x16(), b.as_i16x16())) }
let a = a.as_i16x16();
let b = b.as_i16x16();
unsafe {
let even: i16x16 = simd_shuffle!(
a,
b,
[0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
);
let odd: i16x16 = simd_shuffle!(
a,
b,
[1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
);
simd_sub(even, odd).as_m256i()
}
}
/// Horizontally subtracts adjacent pairs of 32-bit integers in `a` and `b`.
@ -936,7 +970,13 @@ pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vphsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(phsubd(a.as_i32x8(), b.as_i32x8())) }
let a = a.as_i32x8();
let b = b.as_i32x8();
unsafe {
let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
simd_sub(even, odd).as_m256i()
}
}
/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`
@ -1714,7 +1754,12 @@ pub fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m25
#[cfg_attr(test, assert_instr(vpmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
unsafe {
let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
simd_add(even, odd).as_m256i()
}
}
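This expresses `pmaddwd` as a widening multiply followed by the same even/odd pairing used for the horizontal adds: each 32-bit output is `a[2i]*b[2i] + a[2i+1]*b[2i+1]`. One output lane, on scalars (hypothetical helper):

fn madd_pair(a0: i16, a1: i16, b0: i16, b1: i16) -> i32 {
    (a0 as i32 * b0 as i32).wrapping_add(a1 as i32 * b1 as i32)
}

fn main() {
    assert_eq!(madd_pair(2, 3, 10, 100), 320); // 2*10 + 3*100
}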
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
@ -2285,7 +2330,7 @@ pub fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
static_assert_uimm_bits!(IMM8, 8);
unsafe { transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) }
_mm256_permute2f128_si256::<IMM8>(a, b)
}
/// Shuffles 64-bit floating-point elements in `a` across lanes using the
@ -3594,20 +3639,10 @@ pub fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
#[allow(improper_ctypes)]
unsafe extern "C" {
#[link_name = "llvm.x86.avx2.phadd.w"]
fn phaddw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.phadd.d"]
fn phaddd(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx2.phadd.sw"]
fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.phsub.w"]
fn phsubw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.phsub.d"]
fn phsubd(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx2.phsub.sw"]
fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.pmadd.wd"]
fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
#[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
#[link_name = "llvm.x86.avx2.maskload.d"]
@ -3688,8 +3723,6 @@ unsafe extern "C" {
fn permd(a: u32x8, b: u32x8) -> u32x8;
#[link_name = "llvm.x86.avx2.permps"]
fn permps(a: __m256, b: i32x8) -> __m256;
#[link_name = "llvm.x86.avx2.vperm2i128"]
fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
#[link_name = "llvm.x86.avx2.gather.d.d"]
fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
#[link_name = "llvm.x86.avx2.gather.d.d.256"]

View file

@ -5835,7 +5835,20 @@ pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) }
unsafe {
let r: i32x32 = simd_mul(simd_cast(a.as_i16x32()), simd_cast(b.as_i16x32()));
let even: i32x16 = simd_shuffle!(
r,
r,
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
);
let odd: i32x16 = simd_shuffle!(
r,
r,
[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]
);
simd_add(even, odd).as_m512i()
}
}
/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -10427,7 +10440,7 @@ pub fn _kortestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftli_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
a << COUNT
a.unbounded_shl(COUNT)
}
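`unbounded_shl`/`unbounded_shr` match the kshift hardware semantics: a count at or beyond the bit width produces zero, whereas plain `<<`/`>>` with such a count panics in debug builds and masks the count in release builds. On a scalar:

fn main() {
    let a: u32 = 0b1011;
    assert_eq!(a.unbounded_shl(3), 0b1011_000);
    assert_eq!(a.unbounded_shl(32), 0); // `a << 32` would panic in a debug build
    assert_eq!(a.unbounded_shr(35), 0);
}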
/// Shift the bits of 64-bit mask a left by count while shifting in zeros, and store the least significant 64 bits of the result in k.
@ -10438,7 +10451,7 @@ pub fn _kshiftli_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftli_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
a << COUNT
a.unbounded_shl(COUNT)
}
/// Shift the bits of 32-bit mask a right by count while shifting in zeros, and store the least significant 32 bits of the result in k.
@ -10449,7 +10462,7 @@ pub fn _kshiftli_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftri_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
a >> COUNT
a.unbounded_shr(COUNT)
}
/// Shift the bits of 64-bit mask a right by count while shifting in zeros, and store the least significant 64 bits of the result in k.
@ -10460,7 +10473,7 @@ pub fn _kshiftri_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftri_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
a >> COUNT
a.unbounded_shr(COUNT)
}
/// Compute the bitwise AND of 32-bit masks a and b, and if the result is all zeros, store 1 in dst,
@ -11617,8 +11630,6 @@ unsafe extern "C" {
#[link_name = "llvm.x86.avx512.pmul.hr.sw.512"]
fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32;
#[link_name = "llvm.x86.avx512.pmaddw.d.512"]
fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16;
#[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
fn vpmaddubsw(a: i8x64, b: i8x64) -> i16x32;
@ -20325,6 +20336,18 @@ mod tests {
let r = _kshiftli_mask32::<3>(a);
let e: __mmask32 = 0b0100101101001011_0100101101001000;
assert_eq!(r, e);
let r = _kshiftli_mask32::<31>(a);
let e: __mmask32 = 0b1000000000000000_0000000000000000;
assert_eq!(r, e);
let r = _kshiftli_mask32::<32>(a);
let e: __mmask32 = 0b0000000000000000_0000000000000000;
assert_eq!(r, e);
let r = _kshiftli_mask32::<33>(a);
let e: __mmask32 = 0b0000000000000000_0000000000000000;
assert_eq!(r, e);
}
#[simd_test(enable = "avx512bw")]
@ -20333,21 +20356,61 @@ mod tests {
let r = _kshiftli_mask64::<3>(a);
let e: __mmask64 = 0b0110100101101001011_0100101101001000;
assert_eq!(r, e);
let r = _kshiftli_mask64::<63>(a);
let e: __mmask64 = 0b1000000000000000_0000000000000000_0000000000000000_0000000000000000;
assert_eq!(r, e);
let r = _kshiftli_mask64::<64>(a);
let e: __mmask64 = 0b0000000000000000_0000000000000000_0000000000000000_0000000000000000;
assert_eq!(r, e);
let r = _kshiftli_mask64::<65>(a);
let e: __mmask64 = 0b0000000000000000_0000000000000000_0000000000000000_0000000000000000;
assert_eq!(r, e);
}
#[simd_test(enable = "avx512bw")]
unsafe fn test_kshiftri_mask32() {
let a: __mmask32 = 0b0110100101101001_0110100101101001;
let a: __mmask32 = 0b1010100101101001_0110100101101001;
let r = _kshiftri_mask32::<3>(a);
let e: __mmask32 = 0b0000110100101101_0010110100101101;
let e: __mmask32 = 0b0001010100101101_0010110100101101;
assert_eq!(r, e);
let r = _kshiftri_mask32::<31>(a);
let e: __mmask32 = 0b0000000000000000_0000000000000001;
assert_eq!(r, e);
let r = _kshiftri_mask32::<32>(a);
let e: __mmask32 = 0b0000000000000000_0000000000000000;
assert_eq!(r, e);
let r = _kshiftri_mask32::<33>(a);
let e: __mmask32 = 0b0000000000000000_0000000000000000;
assert_eq!(r, e);
}
#[simd_test(enable = "avx512bw")]
unsafe fn test_kshiftri_mask64() {
let a: __mmask64 = 0b0110100101101001011_0100101101001000;
let a: __mmask64 = 0b1010100101101001011_0100101101001000;
let r = _kshiftri_mask64::<3>(a);
let e: __mmask64 = 0b0110100101101001_0110100101101001;
let e: __mmask64 = 0b1010100101101001_0110100101101001;
assert_eq!(r, e);
let r = _kshiftri_mask64::<34>(a);
let e: __mmask64 = 0b0000000000000000_0000000000000000_0000000000000000_0000000000000001;
assert_eq!(r, e);
let r = _kshiftri_mask64::<35>(a);
let e: __mmask64 = 0b0000000000000000_0000000000000000_0000000000000000_0000000000000000;
assert_eq!(r, e);
let r = _kshiftri_mask64::<64>(a);
let e: __mmask64 = 0b0000000000000000_0000000000000000_0000000000000000_0000000000000000;
assert_eq!(r, e);
let r = _kshiftri_mask64::<65>(a);
let e: __mmask64 = 0b0000000000000000_0000000000000000_0000000000000000_0000000000000000;
assert_eq!(r, e);
}

View file

@ -4602,7 +4602,7 @@ pub fn _kortestz_mask8_u8(a: __mmask8, b: __mmask8) -> u8 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftli_mask8<const COUNT: u32>(a: __mmask8) -> __mmask8 {
a << COUNT
a.unbounded_shl(COUNT)
}
/// Shift 8-bit mask a right by count bits while shifting in zeros, and store the result in dst.
@ -4613,7 +4613,7 @@ pub fn _kshiftli_mask8<const COUNT: u32>(a: __mmask8) -> __mmask8 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftri_mask8<const COUNT: u32>(a: __mmask8) -> __mmask8 {
a >> COUNT
a.unbounded_shr(COUNT)
}
/// Compute the bitwise AND of 16-bit masks a and b, and if the result is all zeros, store 1 in dst,
@ -9856,13 +9856,37 @@ mod tests {
let r = _kshiftli_mask8::<3>(a);
let e: __mmask8 = 0b01001000;
assert_eq!(r, e);
let r = _kshiftli_mask8::<7>(a);
let e: __mmask8 = 0b10000000;
assert_eq!(r, e);
let r = _kshiftli_mask8::<8>(a);
let e: __mmask8 = 0b00000000;
assert_eq!(r, e);
let r = _kshiftli_mask8::<9>(a);
let e: __mmask8 = 0b00000000;
assert_eq!(r, e);
}
#[simd_test(enable = "avx512dq")]
unsafe fn test_kshiftri_mask8() {
let a: __mmask8 = 0b01101001;
let a: __mmask8 = 0b10101001;
let r = _kshiftri_mask8::<3>(a);
let e: __mmask8 = 0b00001101;
let e: __mmask8 = 0b00010101;
assert_eq!(r, e);
let r = _kshiftri_mask8::<7>(a);
let e: __mmask8 = 0b00000001;
assert_eq!(r, e);
let r = _kshiftri_mask8::<8>(a);
let e: __mmask8 = 0b00000000;
assert_eq!(r, e);
let r = _kshiftri_mask8::<9>(a);
let e: __mmask8 = 0b00000000;
assert_eq!(r, e);
}

View file

@ -19077,12 +19077,8 @@ pub fn _mm_maskz_expand_pd(k: __mmask8, a: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm512_rol_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x16();
let r = vprold(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_rolv_epi32(a, _mm512_set1_epi32(IMM8))
}
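Routing the immediate forms through the variable-rotate forms is semantically free: rotating every lane by `splat(IMM8)` is lane-wise `rotate_left`, with the count reduced modulo the lane width. The scalar equivalent:

fn main() {
    let x: u32 = 0x8000_0001;
    assert_eq!(x.rotate_left(1), 0x0000_0003);
    assert_eq!(x.rotate_left(33), x.rotate_left(1)); // counts wrap mod 32, like the rolv lowering
}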
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19094,12 +19090,8 @@ pub fn _mm512_rol_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm512_mask_rol_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x16();
let r = vprold(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i32x16()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_mask_rolv_epi32(src, k, a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19111,12 +19103,8 @@ pub fn _mm512_mask_rol_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm512_maskz_rol_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x16();
let r = vprold(a, IMM8);
transmute(simd_select_bitmask(k, r, i32x16::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_maskz_rolv_epi32(k, a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@ -19128,12 +19116,8 @@ pub fn _mm512_maskz_rol_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m5
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm256_rol_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x8();
let r = vprold256(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_rolv_epi32(a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19145,12 +19129,8 @@ pub fn _mm256_rol_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm256_mask_rol_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x8();
let r = vprold256(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i32x8()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_mask_rolv_epi32(src, k, a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19162,12 +19142,8 @@ pub fn _mm256_mask_rol_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m2
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm256_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x8();
let r = vprold256(a, IMM8);
transmute(simd_select_bitmask(k, r, i32x8::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_maskz_rolv_epi32(k, a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@ -19179,12 +19155,8 @@ pub fn _mm256_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m25
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm_rol_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x4();
let r = vprold128(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm_rolv_epi32(a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19196,12 +19168,8 @@ pub fn _mm_rol_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm_mask_rol_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x4();
let r = vprold128(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i32x4()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_mask_rolv_epi32(src, k, a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19213,12 +19181,8 @@ pub fn _mm_mask_rol_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x4();
let r = vprold128(a, IMM8);
transmute(simd_select_bitmask(k, r, i32x4::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_maskz_rolv_epi32(k, a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@ -19230,12 +19194,8 @@ pub fn _mm_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm512_ror_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x16();
let r = vprord(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_rorv_epi32(a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19247,12 +19207,8 @@ pub fn _mm512_ror_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(3)]
pub fn _mm512_mask_ror_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x16();
let r = vprord(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i32x16()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_mask_rorv_epi32(src, k, a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19264,12 +19220,8 @@ pub fn _mm512_mask_ror_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(2)]
pub fn _mm512_maskz_ror_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x16();
let r = vprord(a, IMM8);
transmute(simd_select_bitmask(k, r, i32x16::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_maskz_rorv_epi32(k, a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@ -19281,12 +19233,8 @@ pub fn _mm512_maskz_ror_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m5
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm256_ror_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x8();
let r = vprord256(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_rorv_epi32(a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19298,12 +19246,8 @@ pub fn _mm256_ror_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(3)]
pub fn _mm256_mask_ror_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x8();
let r = vprord256(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i32x8()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_mask_rorv_epi32(src, k, a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19315,12 +19259,8 @@ pub fn _mm256_mask_ror_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m2
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(2)]
pub fn _mm256_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x8();
let r = vprord256(a, IMM8);
transmute(simd_select_bitmask(k, r, i32x8::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_maskz_rorv_epi32(k, a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@ -19332,12 +19272,8 @@ pub fn _mm256_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m25
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm_ror_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x4();
let r = vprord128(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm_rorv_epi32(a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19349,12 +19285,8 @@ pub fn _mm_ror_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(3)]
pub fn _mm_mask_ror_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x4();
let r = vprord128(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i32x4()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_mask_rorv_epi32(src, k, a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19366,12 +19298,8 @@ pub fn _mm_mask_ror_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(2)]
pub fn _mm_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x4();
let r = vprord128(a, IMM8);
transmute(simd_select_bitmask(k, r, i32x4::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_maskz_rorv_epi32(k, a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@ -19383,12 +19311,8 @@ pub fn _mm_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm512_rol_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x8();
let r = vprolq(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_rolv_epi64(a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19400,12 +19324,8 @@ pub fn _mm512_rol_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm512_mask_rol_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x8();
let r = vprolq(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i64x8()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_mask_rolv_epi64(src, k, a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19417,12 +19337,8 @@ pub fn _mm512_mask_rol_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m5
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm512_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x8();
let r = vprolq(a, IMM8);
transmute(simd_select_bitmask(k, r, i64x8::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_maskz_rolv_epi64(k, a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@ -19434,12 +19350,8 @@ pub fn _mm512_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m51
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm256_rol_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x4();
let r = vprolq256(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_rolv_epi64(a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19451,12 +19363,8 @@ pub fn _mm256_rol_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm256_mask_rol_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x4();
let r = vprolq256(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i64x4()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_mask_rolv_epi64(src, k, a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19468,12 +19376,8 @@ pub fn _mm256_mask_rol_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m2
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm256_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x4();
let r = vprolq256(a, IMM8);
transmute(simd_select_bitmask(k, r, i64x4::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_maskz_rolv_epi64(k, a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@ -19485,12 +19389,8 @@ pub fn _mm256_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m25
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm_rol_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x2();
let r = vprolq128(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm_rolv_epi64(a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19502,12 +19402,8 @@ pub fn _mm_rol_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm_mask_rol_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x2();
let r = vprolq128(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i64x2()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_mask_rolv_epi64(src, k, a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19519,12 +19415,8 @@ pub fn _mm_mask_rol_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x2();
let r = vprolq128(a, IMM8);
transmute(simd_select_bitmask(k, r, i64x2::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_maskz_rolv_epi64(k, a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@ -19536,12 +19428,8 @@ pub fn _mm_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(1)]
pub fn _mm512_ror_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x8();
let r = vprorq(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_rorv_epi64(a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19553,12 +19441,8 @@ pub fn _mm512_ror_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(3)]
pub fn _mm512_mask_ror_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x8();
let r = vprorq(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i64x8()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_mask_rorv_epi64(src, k, a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19570,12 +19454,8 @@ pub fn _mm512_mask_ror_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m5
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(2)]
pub fn _mm512_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x8();
let r = vprorq(a, IMM8);
transmute(simd_select_bitmask(k, r, i64x8::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm512_maskz_rorv_epi64(k, a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@ -19587,12 +19467,8 @@ pub fn _mm512_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m51
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(1)]
pub fn _mm256_ror_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x4();
let r = vprorq256(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_rorv_epi64(a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19604,12 +19480,8 @@ pub fn _mm256_ror_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(3)]
pub fn _mm256_mask_ror_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x4();
let r = vprorq256(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i64x4()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_mask_rorv_epi64(src, k, a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19621,12 +19493,8 @@ pub fn _mm256_mask_ror_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m2
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(2)]
pub fn _mm256_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x4();
let r = vprorq256(a, IMM8);
transmute(simd_select_bitmask(k, r, i64x4::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm256_maskz_rorv_epi64(k, a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@ -19638,12 +19506,8 @@ pub fn _mm256_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m25
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(1)]
pub fn _mm_ror_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x2();
let r = vprorq128(a, IMM8);
transmute(r)
}
static_assert_uimm_bits!(IMM8, 8);
_mm_rorv_epi64(a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -19655,12 +19519,8 @@ pub fn _mm_ror_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(3)]
pub fn _mm_mask_ror_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x2();
let r = vprorq128(a, IMM8);
transmute(simd_select_bitmask(k, r, src.as_i64x2()))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_mask_rorv_epi64(src, k, a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -19672,12 +19532,8 @@ pub fn _mm_mask_ror_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(2)]
pub fn _mm_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
unsafe {
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i64x2();
let r = vprorq128(a, IMM8);
transmute(simd_select_bitmask(k, r, i64x2::ZERO))
}
static_assert_uimm_bits!(IMM8, 8);
_mm_maskz_rorv_epi64(k, a, _mm_set1_epi64x(IMM8 as i64))
}
/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
@ -21296,7 +21152,13 @@ pub fn _mm_maskz_srav_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub fn _mm512_rolv_epi32(a: __m512i, b: __m512i) -> __m512i {
unsafe { transmute(vprolvd(a.as_i32x16(), b.as_i32x16())) }
unsafe {
transmute(simd_funnel_shl(
a.as_u32x16(),
a.as_u32x16(),
simd_and(b.as_u32x16(), u32x16::splat(31)),
))
}
}
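A funnel shift of a value with itself is a rotate: `fshl(x, x, n)` yields the high word of the double-width value `x:x` shifted left by `n`. A scalar check of the identity (hypothetical helper; the `& 31` mirrors the `simd_and` with `splat(31)` above):

fn fshl_self(x: u32, n: u32) -> u32 {
    let doubled = ((x as u64) << 32) | x as u64; // the "funnel" x:x
    ((doubled << (n & 31)) >> 32) as u32
}

fn main() {
    for n in 0..64 {
        assert_eq!(fshl_self(0xDEAD_BEEF, n), 0xDEAD_BEEFu32.rotate_left(n));
    }
}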
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21335,7 +21197,13 @@ pub fn _mm512_maskz_rolv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub fn _mm256_rolv_epi32(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(vprolvd256(a.as_i32x8(), b.as_i32x8())) }
unsafe {
transmute(simd_funnel_shl(
a.as_u32x8(),
a.as_u32x8(),
simd_and(b.as_u32x8(), u32x8::splat(31)),
))
}
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21374,7 +21242,13 @@ pub fn _mm256_maskz_rolv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub fn _mm_rolv_epi32(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(vprolvd128(a.as_i32x4(), b.as_i32x4())) }
unsafe {
transmute(simd_funnel_shl(
a.as_u32x4(),
a.as_u32x4(),
simd_and(b.as_u32x4(), u32x4::splat(31)),
))
}
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21413,7 +21287,13 @@ pub fn _mm_maskz_rolv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub fn _mm512_rorv_epi32(a: __m512i, b: __m512i) -> __m512i {
unsafe { transmute(vprorvd(a.as_i32x16(), b.as_i32x16())) }
unsafe {
transmute(simd_funnel_shr(
a.as_u32x16(),
a.as_u32x16(),
simd_and(b.as_u32x16(), u32x16::splat(31)),
))
}
}
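Symmetrically, `fshr(x, x, n)` takes the low word of `x:x` shifted right by `n`, which is `rotate_right` (hypothetical helper, same assumptions as the `fshl_self` sketch above):

fn fshr_self(x: u32, n: u32) -> u32 {
    let doubled = ((x as u64) << 32) | x as u64;
    (doubled >> (n & 31)) as u32 // the truncating cast keeps the low word
}

fn main() {
    for n in 0..64 {
        assert_eq!(fshr_self(0xDEAD_BEEF, n), 0xDEAD_BEEFu32.rotate_right(n));
    }
}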
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21452,7 +21332,13 @@ pub fn _mm512_maskz_rorv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub fn _mm256_rorv_epi32(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(vprorvd256(a.as_i32x8(), b.as_i32x8())) }
unsafe {
transmute(simd_funnel_shr(
a.as_u32x8(),
a.as_u32x8(),
simd_and(b.as_u32x8(), u32x8::splat(31)),
))
}
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21491,7 +21377,13 @@ pub fn _mm256_maskz_rorv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub fn _mm_rorv_epi32(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(vprorvd128(a.as_i32x4(), b.as_i32x4())) }
unsafe {
transmute(simd_funnel_shr(
a.as_u32x4(),
a.as_u32x4(),
simd_and(b.as_u32x4(), u32x4::splat(31)),
))
}
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21530,7 +21422,13 @@ pub fn _mm_maskz_rorv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub fn _mm512_rolv_epi64(a: __m512i, b: __m512i) -> __m512i {
unsafe { transmute(vprolvq(a.as_i64x8(), b.as_i64x8())) }
unsafe {
transmute(simd_funnel_shl(
a.as_u64x8(),
a.as_u64x8(),
simd_and(b.as_u64x8(), u64x8::splat(63)),
))
}
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21569,7 +21467,13 @@ pub fn _mm512_maskz_rolv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub fn _mm256_rolv_epi64(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(vprolvq256(a.as_i64x4(), b.as_i64x4())) }
unsafe {
transmute(simd_funnel_shl(
a.as_u64x4(),
a.as_u64x4(),
simd_and(b.as_u64x4(), u64x4::splat(63)),
))
}
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21608,7 +21512,13 @@ pub fn _mm256_maskz_rolv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub fn _mm_rolv_epi64(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(vprolvq128(a.as_i64x2(), b.as_i64x2())) }
unsafe {
transmute(simd_funnel_shl(
a.as_u64x2(),
a.as_u64x2(),
simd_and(b.as_u64x2(), u64x2::splat(63)),
))
}
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21647,7 +21557,13 @@ pub fn _mm_maskz_rolv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub fn _mm512_rorv_epi64(a: __m512i, b: __m512i) -> __m512i {
unsafe { transmute(vprorvq(a.as_i64x8(), b.as_i64x8())) }
unsafe {
transmute(simd_funnel_shr(
a.as_u64x8(),
a.as_u64x8(),
simd_and(b.as_u64x8(), u64x8::splat(63)),
))
}
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21686,7 +21602,13 @@ pub fn _mm512_maskz_rorv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub fn _mm256_rorv_epi64(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(vprorvq256(a.as_i64x4(), b.as_i64x4())) }
unsafe {
transmute(simd_funnel_shr(
a.as_u64x4(),
a.as_u64x4(),
simd_and(b.as_u64x4(), u64x4::splat(63)),
))
}
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -21725,7 +21647,13 @@ pub fn _mm256_maskz_rorv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub fn _mm_rorv_epi64(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(vprorvq128(a.as_i64x2(), b.as_i64x2())) }
unsafe {
transmute(simd_funnel_shr(
a.as_u64x2(),
a.as_u64x2(),
simd_and(b.as_u64x2(), u64x2::splat(63)),
))
}
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -29234,7 +29162,7 @@ pub fn _kortestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftli_mask16<const COUNT: u32>(a: __mmask16) -> __mmask16 {
a << COUNT
a.unbounded_shl(COUNT)
}
/// Shift 16-bit mask a right by count bits while shifting in zeros, and store the result in dst.
@ -29245,7 +29173,7 @@ pub fn _kshiftli_mask16<const COUNT: u32>(a: __mmask16) -> __mmask16 {
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
pub fn _kshiftri_mask16<const COUNT: u32>(a: __mmask16) -> __mmask16 {
a >> COUNT
a.unbounded_shr(COUNT)
}
/// Load 16-bit mask from memory
@ -29665,6 +29593,7 @@ pub fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask
#[cfg_attr(test, assert_instr(vmovntps))]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
// see #1541; we use inline asm to be sure, because the LLVM LangRef isn't clear enough about nontemporal stores
crate::arch::asm!(
vps!("vmovntps", ",{a}"),
p = in(reg) mem_addr,
@ -29691,6 +29620,7 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
#[cfg_attr(test, assert_instr(vmovntpd))]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
// see #1541; we use inline asm to be sure, because the LLVM LangRef isn't clear enough about nontemporal stores
crate::arch::asm!(
vps!("vmovntpd", ",{a}"),
p = in(reg) mem_addr,
@ -29717,6 +29647,7 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
#[cfg_attr(test, assert_instr(vmovntdq))]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_si512(mem_addr: *mut __m512i, a: __m512i) {
// see #1541; we use inline asm to be sure, because the LLVM LangRef isn't clear enough about nontemporal stores
crate::arch::asm!(
vps!("vmovntdq", ",{a}"),
p = in(reg) mem_addr,
@ -42902,62 +42833,6 @@ unsafe extern "C" {
#[link_name = "llvm.x86.avx512.mask.cmp.pd.128"]
fn vcmppd128(a: f64x2, b: f64x2, op: i32, m: i8) -> i8;
#[link_name = "llvm.x86.avx512.mask.prol.d.512"]
fn vprold(a: i32x16, i8: i32) -> i32x16;
#[link_name = "llvm.x86.avx512.mask.prol.d.256"]
fn vprold256(a: i32x8, i8: i32) -> i32x8;
#[link_name = "llvm.x86.avx512.mask.prol.d.128"]
fn vprold128(a: i32x4, i8: i32) -> i32x4;
#[link_name = "llvm.x86.avx512.mask.pror.d.512"]
fn vprord(a: i32x16, i8: i32) -> i32x16;
#[link_name = "llvm.x86.avx512.mask.pror.d.256"]
fn vprord256(a: i32x8, i8: i32) -> i32x8;
#[link_name = "llvm.x86.avx512.mask.pror.d.128"]
fn vprord128(a: i32x4, i8: i32) -> i32x4;
#[link_name = "llvm.x86.avx512.mask.prol.q.512"]
fn vprolq(a: i64x8, i8: i32) -> i64x8;
#[link_name = "llvm.x86.avx512.mask.prol.q.256"]
fn vprolq256(a: i64x4, i8: i32) -> i64x4;
#[link_name = "llvm.x86.avx512.mask.prol.q.128"]
fn vprolq128(a: i64x2, i8: i32) -> i64x2;
#[link_name = "llvm.x86.avx512.mask.pror.q.512"]
fn vprorq(a: i64x8, i8: i32) -> i64x8;
#[link_name = "llvm.x86.avx512.mask.pror.q.256"]
fn vprorq256(a: i64x4, i8: i32) -> i64x4;
#[link_name = "llvm.x86.avx512.mask.pror.q.128"]
fn vprorq128(a: i64x2, i8: i32) -> i64x2;
#[link_name = "llvm.x86.avx512.mask.prolv.d.512"]
fn vprolvd(a: i32x16, b: i32x16) -> i32x16;
#[link_name = "llvm.x86.avx512.mask.prolv.d.256"]
fn vprolvd256(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx512.mask.prolv.d.128"]
fn vprolvd128(a: i32x4, b: i32x4) -> i32x4;
#[link_name = "llvm.x86.avx512.mask.prorv.d.512"]
fn vprorvd(a: i32x16, b: i32x16) -> i32x16;
#[link_name = "llvm.x86.avx512.mask.prorv.d.256"]
fn vprorvd256(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx512.mask.prorv.d.128"]
fn vprorvd128(a: i32x4, b: i32x4) -> i32x4;
#[link_name = "llvm.x86.avx512.mask.prolv.q.512"]
fn vprolvq(a: i64x8, b: i64x8) -> i64x8;
#[link_name = "llvm.x86.avx512.mask.prolv.q.256"]
fn vprolvq256(a: i64x4, b: i64x4) -> i64x4;
#[link_name = "llvm.x86.avx512.mask.prolv.q.128"]
fn vprolvq128(a: i64x2, b: i64x2) -> i64x2;
#[link_name = "llvm.x86.avx512.mask.prorv.q.512"]
fn vprorvq(a: i64x8, b: i64x8) -> i64x8;
#[link_name = "llvm.x86.avx512.mask.prorv.q.256"]
fn vprorvq256(a: i64x4, b: i64x4) -> i64x4;
#[link_name = "llvm.x86.avx512.mask.prorv.q.128"]
fn vprorvq128(a: i64x2, b: i64x2) -> i64x2;
#[link_name = "llvm.x86.avx512.psllv.d.512"]
fn vpsllvd(a: i32x16, b: i32x16) -> i32x16;
#[link_name = "llvm.x86.avx512.psrlv.d.512"]
@ -56220,13 +56095,37 @@ mod tests {
let r = _kshiftli_mask16::<3>(a);
let e: __mmask16 = 0b1011011000011000;
assert_eq!(r, e);
let r = _kshiftli_mask16::<15>(a);
let e: __mmask16 = 0b1000000000000000;
assert_eq!(r, e);
let r = _kshiftli_mask16::<16>(a);
let e: __mmask16 = 0b0000000000000000;
assert_eq!(r, e);
let r = _kshiftli_mask16::<17>(a);
let e: __mmask16 = 0b0000000000000000;
assert_eq!(r, e);
}
#[simd_test(enable = "avx512dq")]
unsafe fn test_kshiftri_mask16() {
let a: __mmask16 = 0b0110100100111100;
let a: __mmask16 = 0b1010100100111100;
let r = _kshiftri_mask16::<3>(a);
let e: __mmask16 = 0b0000110100100111;
let e: __mmask16 = 0b0001010100100111;
assert_eq!(r, e);
let r = _kshiftri_mask16::<15>(a);
let e: __mmask16 = 0b0000000000000001;
assert_eq!(r, e);
let r = _kshiftri_mask16::<16>(a);
let e: __mmask16 = 0b0000000000000000;
assert_eq!(r, e);
let r = _kshiftri_mask16::<17>(a);
let e: __mmask16 = 0b0000000000000000;
assert_eq!(r, e);
}
@ -56432,6 +56331,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 16] };
_mm512_stream_ps(&mut mem.data[0] as *mut f32, a);
_mm_sfence();
for i in 0..16 {
assert_eq!(mem.data[i], get_m512(a, i));
}
@ -56448,6 +56348,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 8] };
_mm512_stream_pd(&mut mem.data[0] as *mut f64, a);
_mm_sfence();
for i in 0..8 {
assert_eq!(mem.data[i], get_m512d(a, i));
}
@ -56464,6 +56365,7 @@ mod tests {
let mut mem = Memory { data: [-1; 8] };
_mm512_stream_si512(mem.data.as_mut_ptr().cast(), a);
_mm_sfence();
for i in 0..8 {
assert_eq!(mem.data[i], get_m512i(a, i));
}

View file

@ -1615,7 +1615,7 @@ pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
#[cfg_attr(test, assert_instr(vaddsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
_mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) + _mm_cvtsh_h(b)) }
}
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@ -1628,7 +1628,16 @@ pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
#[cfg_attr(test, assert_instr(vaddsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
unsafe {
let extractsrc: f16 = simd_extract!(src, 0);
let mut add: f16 = extractsrc;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
add = extracta + extractb;
}
simd_insert!(a, 0, add)
}
}
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@ -1641,7 +1650,15 @@ pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
#[cfg_attr(test, assert_instr(vaddsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
unsafe {
let mut add: f16 = 0.;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
add = extracta + extractb;
}
simd_insert!(a, 0, add)
}
}
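The three scalar-`f16` add forms differ only in where lane 0 comes from when the mask bit is clear: the plain form always computes `a0 + b0`, the masked form falls back to `src0`, and the zero-masked form to `0.0`; lanes 1..7 always copy `a`. The sub/mul/div families below follow the same shape with the operator swapped. A scalar model (hypothetical helper; `f32` stands in for the unstable `f16`):

fn add_sh_model(src0: f32, k: u8, a: [f32; 8], b: [f32; 8], zero_masked: bool) -> [f32; 8] {
    let mut r = a; // upper lanes always come from `a`
    r[0] = if k & 1 != 0 {
        a[0] + b[0]
    } else if zero_masked {
        0.0
    } else {
        src0
    };
    r
}

fn main() {
    let (a, b) = ([1.0; 8], [2.0; 8]);
    assert_eq!(add_sh_model(9.0, 1, a, b, false)[0], 3.0); // mask set: a0 + b0
    assert_eq!(add_sh_model(9.0, 0, a, b, false)[0], 9.0); // mask clear: src
    assert_eq!(add_sh_model(9.0, 0, a, b, true)[0], 0.0); // zeromask clear: 0
}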
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
@ -1927,7 +1944,7 @@ pub fn _mm_maskz_sub_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
#[cfg_attr(test, assert_instr(vsubsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
_mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) - _mm_cvtsh_h(b)) }
}
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
@ -1940,7 +1957,16 @@ pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
#[cfg_attr(test, assert_instr(vsubsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
unsafe {
let extractsrc: f16 = simd_extract!(src, 0);
let mut res: f16 = extractsrc;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
res = extracta - extractb;
}
simd_insert!(a, 0, res)
}
}
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
@ -1953,7 +1979,15 @@ pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
#[cfg_attr(test, assert_instr(vsubsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
unsafe {
let mut res: f16 = 0.;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
res = extracta - extractb;
}
simd_insert!(a, 0, res)
}
}
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
@ -2239,7 +2273,7 @@ pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
#[cfg_attr(test, assert_instr(vmulsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
_mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) * _mm_cvtsh_h(b)) }
}
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@ -2252,7 +2286,16 @@ pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
#[cfg_attr(test, assert_instr(vmulsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
unsafe {
let extractsrc: f16 = simd_extract!(src, 0);
let mut mul: f16 = extractsrc;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
mul = extracta * extractb;
}
simd_insert!(a, 0, mul)
}
}
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@ -2265,7 +2308,15 @@ pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
#[cfg_attr(test, assert_instr(vmulsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
unsafe {
let mut mul: f16 = 0.;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
mul = extracta * extractb;
}
simd_insert!(a, 0, mul)
}
}
/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
@ -2551,7 +2602,7 @@ pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
#[cfg_attr(test, assert_instr(vdivsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
_mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) / _mm_cvtsh_h(b)) }
}
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
@ -2564,7 +2615,16 @@ pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
#[cfg_attr(test, assert_instr(vdivsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
unsafe {
let extractsrc: f16 = simd_extract!(src, 0);
let mut div: f16 = extractsrc;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
div = extracta / extractb;
}
simd_insert!(a, 0, div)
}
}
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
@ -2577,7 +2637,15 @@ pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
#[cfg_attr(test, assert_instr(vdivsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
_mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
unsafe {
let mut div: f16 = 0.;
if (k & 0b00000001) != 0 {
let extracta: f16 = simd_extract!(a, 0);
let extractb: f16 = simd_extract!(b, 0);
div = extracta / extractb;
}
simd_insert!(a, 0, div)
}
}
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
@ -7116,7 +7184,11 @@ pub fn _mm_maskz_fnmsub_round_sh<const ROUNDING: i32>(
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
unsafe { vfmaddsubph_128(a, b, c) }
unsafe {
let add = simd_fma(a, b, c);
let sub = simd_fma(a, b, simd_neg(c));
simd_shuffle!(sub, add, [0, 9, 2, 11, 4, 13, 6, 15])
}
}
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
@ -7167,7 +7239,15 @@ pub fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) ->
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
unsafe { vfmaddsubph_256(a, b, c) }
unsafe {
let add = simd_fma(a, b, c);
let sub = simd_fma(a, b, simd_neg(c));
simd_shuffle!(
sub,
add,
[0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31]
)
}
}
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
@ -7218,7 +7298,18 @@ pub fn _mm256_maskz_fmaddsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
_mm512_fmaddsub_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
unsafe {
let add = simd_fma(a, b, c);
let sub = simd_fma(a, b, simd_neg(c));
simd_shuffle!(
sub,
add,
[
0, 33, 2, 35, 4, 37, 6, 39, 8, 41, 10, 43, 12, 45, 14, 47, 16, 49, 18, 51, 20, 53,
22, 55, 24, 57, 26, 59, 28, 61, 30, 63
]
)
}
}
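Editor's note: a scalar reference for the interleave used by all three widths above — `sub` supplies the even lanes (a*b - c) and `add` the odd lanes (a*b + c), which is what the shuffle indices encode. f32 stands in for the unstable f16 type:

fn fmaddsub_ref(a: &[f32], b: &[f32], c: &[f32]) -> Vec<f32> {
    (0..a.len())
        .map(|i| {
            if i % 2 == 0 {
                a[i].mul_add(b[i], -c[i]) // even lanes: a*b - c
            } else {
                a[i].mul_add(b[i], c[i]) // odd lanes: a*b + c
            }
        })
        .collect()
}

With `c` negated (the fmsubadd forms below), the pattern flips: even lanes become a*b + c and odd lanes a*b - c.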
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
@ -7391,7 +7482,7 @@ pub fn _mm512_maskz_fmaddsub_round_ph<const ROUNDING: i32>(
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
unsafe { vfmaddsubph_128(a, b, simd_neg(c)) }
_mm_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
}
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
@ -7442,7 +7533,7 @@ pub fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) ->
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
unsafe { vfmaddsubph_256(a, b, simd_neg(c)) }
_mm256_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
}
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
@ -7493,7 +7584,7 @@ pub fn _mm256_maskz_fmsubadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
_mm512_fmsubadd_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
_mm512_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
}
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
@ -11111,7 +11202,7 @@ pub fn _mm256_reduce_mul_ph(a: __m256h) -> f16 {
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_reduce_mul_ph(a: __m512h) -> f16 {
pub fn _mm512_reduce_mul_ph(a: __m512h) -> f16 {
unsafe {
let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
let q = simd_shuffle!(
@ -16341,10 +16432,6 @@ unsafe extern "C" {
#[link_name = "llvm.x86.avx512fp16.vfmadd.f16"]
fn vfmaddsh(a: f16, b: f16, c: f16, rounding: i32) -> f16;
#[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.128"]
fn vfmaddsubph_128(a: __m128h, b: __m128h, c: __m128h) -> __m128h;
#[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.256"]
fn vfmaddsubph_256(a: __m256h, b: __m256h, c: __m256h) -> __m256h;
#[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.512"]
fn vfmaddsubph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;

View file

@ -500,7 +500,13 @@ pub fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvq))]
pub fn _mm512_shldv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
unsafe { transmute(simd_funnel_shl(a.as_i64x8(), b.as_i64x8(), c.as_i64x8())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i64x8(),
b.as_i64x8(),
simd_and(c.as_i64x8(), i64x8::splat(63)),
))
}
}
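Editor's note: the `& 63` mask matches the hardware, which takes the shift amount modulo the lane width; without it, `simd_funnel_shl` would see out-of-range shift amounts. A per-lane scalar sketch of the semantics:

fn shldv64_lane(a: u64, b: u64, c: u64) -> u64 {
    let s = (c & 63) as u32;
    // High 64 bits of the 128-bit value (a:b) << s; s == 0 is special-cased
    // because shifting a u64 by 64 is not defined.
    if s == 0 { a } else { (a << s) | (b >> (64 - s)) }
}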
/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -539,7 +545,13 @@ pub fn _mm512_maskz_shldv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i)
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvq))]
pub fn _mm256_shldv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
unsafe { transmute(simd_funnel_shl(a.as_i64x4(), b.as_i64x4(), c.as_i64x4())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i64x4(),
b.as_i64x4(),
simd_and(c.as_i64x4(), i64x4::splat(63)),
))
}
}
/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -578,7 +590,13 @@ pub fn _mm256_maskz_shldv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i)
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvq))]
pub fn _mm_shldv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
unsafe { transmute(simd_funnel_shl(a.as_i64x2(), b.as_i64x2(), c.as_i64x2())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i64x2(),
b.as_i64x2(),
simd_and(c.as_i64x2(), i64x2::splat(63)),
))
}
}
/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -617,7 +635,13 @@ pub fn _mm_maskz_shldv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) ->
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvd))]
pub fn _mm512_shldv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
unsafe { transmute(simd_funnel_shl(a.as_i32x16(), b.as_i32x16(), c.as_i32x16())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i32x16(),
b.as_i32x16(),
simd_and(c.as_i32x16(), i32x16::splat(31)),
))
}
}
/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -656,7 +680,13 @@ pub fn _mm512_maskz_shldv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvd))]
pub fn _mm256_shldv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
unsafe { transmute(simd_funnel_shl(a.as_i32x8(), b.as_i32x8(), c.as_i32x8())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i32x8(),
b.as_i32x8(),
simd_and(c.as_i32x8(), i32x8::splat(31)),
))
}
}
/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -695,7 +725,13 @@ pub fn _mm256_maskz_shldv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i)
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvd))]
pub fn _mm_shldv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
unsafe { transmute(simd_funnel_shl(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i32x4(),
b.as_i32x4(),
simd_and(c.as_i32x4(), i32x4::splat(31)),
))
}
}
/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -734,7 +770,13 @@ pub fn _mm_maskz_shldv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) ->
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvw))]
pub fn _mm512_shldv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
unsafe { transmute(simd_funnel_shl(a.as_i16x32(), b.as_i16x32(), c.as_i16x32())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i16x32(),
b.as_i16x32(),
simd_and(c.as_i16x32(), i16x32::splat(15)),
))
}
}
/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -773,7 +815,13 @@ pub fn _mm512_maskz_shldv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvw))]
pub fn _mm256_shldv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
unsafe { transmute(simd_funnel_shl(a.as_i16x16(), b.as_i16x16(), c.as_i16x16())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i16x16(),
b.as_i16x16(),
simd_and(c.as_i16x16(), i16x16::splat(15)),
))
}
}
/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -812,7 +860,13 @@ pub fn _mm256_maskz_shldv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshldvw))]
pub fn _mm_shldv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
unsafe { transmute(simd_funnel_shl(a.as_i16x8(), b.as_i16x8(), c.as_i16x8())) }
unsafe {
transmute(simd_funnel_shl(
a.as_i16x8(),
b.as_i16x8(),
simd_and(c.as_i16x8(), i16x8::splat(15)),
))
}
}
/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -851,7 +905,13 @@ pub fn _mm_maskz_shldv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) ->
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvq))]
pub fn _mm512_shrdv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
unsafe { transmute(simd_funnel_shr(b.as_i64x8(), a.as_i64x8(), c.as_i64x8())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i64x8(),
a.as_i64x8(),
simd_and(c.as_i64x8(), i64x8::splat(63)),
))
}
}
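Editor's note: the right-funnel counterpart, again per lane — the low 64 bits of the 128-bit value (b:a) shifted right by (c & 63), so bits shift in from `b` at the top:

fn shrdv64_lane(a: u64, b: u64, c: u64) -> u64 {
    let s = (c & 63) as u32;
    if s == 0 { a } else { (a >> s) | (b << (64 - s)) }
}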
/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -890,7 +950,13 @@ pub fn _mm512_maskz_shrdv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i)
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvq))]
pub fn _mm256_shrdv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
unsafe { transmute(simd_funnel_shr(b.as_i64x4(), a.as_i64x4(), c.as_i64x4())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i64x4(),
a.as_i64x4(),
simd_and(c.as_i64x4(), i64x4::splat(63)),
))
}
}
/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -929,7 +995,13 @@ pub fn _mm256_maskz_shrdv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i)
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvq))]
pub fn _mm_shrdv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
unsafe { transmute(simd_funnel_shr(b.as_i64x2(), a.as_i64x2(), c.as_i64x2())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i64x2(),
a.as_i64x2(),
simd_and(c.as_i64x2(), i64x2::splat(63)),
))
}
}
/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -968,7 +1040,13 @@ pub fn _mm_maskz_shrdv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) ->
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvd))]
pub fn _mm512_shrdv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
unsafe { transmute(simd_funnel_shr(b.as_i32x16(), a.as_i32x16(), c.as_i32x16())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i32x16(),
a.as_i32x16(),
simd_and(c.as_i32x16(), i32x16::splat(31)),
))
}
}
/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -1007,7 +1085,13 @@ pub fn _mm512_maskz_shrdv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvd))]
pub fn _mm256_shrdv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
unsafe { transmute(simd_funnel_shr(b.as_i32x8(), a.as_i32x8(), c.as_i32x8())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i32x8(),
a.as_i32x8(),
simd_and(c.as_i32x8(), i32x8::splat(31)),
))
}
}
/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -1046,7 +1130,13 @@ pub fn _mm256_maskz_shrdv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i)
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvd))]
pub fn _mm_shrdv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
unsafe { transmute(simd_funnel_shr(b.as_i32x4(), a.as_i32x4(), c.as_i32x4())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i32x4(),
a.as_i32x4(),
simd_and(c.as_i32x4(), i32x4::splat(31)),
))
}
}
/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -1085,7 +1175,13 @@ pub fn _mm_maskz_shrdv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) ->
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvw))]
pub fn _mm512_shrdv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
unsafe { transmute(simd_funnel_shr(b.as_i16x32(), a.as_i16x32(), c.as_i16x32())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i16x32(),
a.as_i16x32(),
simd_and(c.as_i16x32(), i16x32::splat(15)),
))
}
}
/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -1124,7 +1220,13 @@ pub fn _mm512_maskz_shrdv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvw))]
pub fn _mm256_shrdv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
unsafe { transmute(simd_funnel_shr(b.as_i16x16(), a.as_i16x16(), c.as_i16x16())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i16x16(),
a.as_i16x16(),
simd_and(c.as_i16x16(), i16x16::splat(15)),
))
}
}
/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -1163,7 +1265,13 @@ pub fn _mm256_maskz_shrdv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpshrdvw))]
pub fn _mm_shrdv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
unsafe { transmute(simd_funnel_shr(b.as_i16x8(), a.as_i16x8(), c.as_i16x8())) }
unsafe {
transmute(simd_funnel_shr(
b.as_i16x8(),
a.as_i16x8(),
simd_and(c.as_i16x8(), i16x8::splat(15)),
))
}
}
/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

View file

@ -10,7 +10,7 @@ use stdarch_test::assert_instr;
#[inline]
#[cfg_attr(test, assert_instr(bswap))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _bswap(x: i32) -> i32 {
pub fn _bswap(x: i32) -> i32 {
x.swap_bytes()
}
@ -20,9 +20,7 @@ mod tests {
#[test]
fn test_bswap() {
unsafe {
assert_eq!(_bswap(0x0EADBE0F), 0x0FBEAD0E);
assert_eq!(_bswap(0x00000000), 0x00000000);
}
assert_eq!(_bswap(0x0EADBE0F), 0x0FBEAD0E);
assert_eq!(_bswap(0x00000000), 0x00000000);
}
}

View file

@ -3,16 +3,13 @@
//! [F16C intrinsics]: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=fp16&expand=1769
use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;
#[cfg(test)]
use stdarch_test::assert_instr;
#[allow(improper_ctypes)]
unsafe extern "unadjusted" {
#[link_name = "llvm.x86.vcvtph2ps.128"]
fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4;
#[link_name = "llvm.x86.vcvtph2ps.256"]
fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8;
#[link_name = "llvm.x86.vcvtps2ph.128"]
fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8;
#[link_name = "llvm.x86.vcvtps2ph.256"]
@ -29,7 +26,11 @@ unsafe extern "unadjusted" {
#[cfg_attr(test, assert_instr("vcvtph2ps"))]
#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")]
pub fn _mm_cvtph_ps(a: __m128i) -> __m128 {
unsafe { transmute(llvm_vcvtph2ps_128(transmute(a))) }
unsafe {
let a: f16x8 = transmute(a);
let a: f16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
simd_cast(a)
}
}
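Editor's note: lane-wise, the cast above is an exact widening, the same as a scalar `as` cast. A sketch (requires the unstable f16 type):

fn cvtph4_ref(a: [f16; 4]) -> [f32; 4] {
    // each half-precision lane widens losslessly to single precision
    [a[0] as f32, a[1] as f32, a[2] as f32, a[3] as f32]
}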
/// Converts the 8 x 16-bit half-precision float values in the 128-bit vector
@ -41,7 +42,10 @@ pub fn _mm_cvtph_ps(a: __m128i) -> __m128 {
#[cfg_attr(test, assert_instr("vcvtph2ps"))]
#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")]
pub fn _mm256_cvtph_ps(a: __m128i) -> __m256 {
unsafe { transmute(llvm_vcvtph2ps_256(transmute(a))) }
unsafe {
let a: f16x8 = transmute(a);
simd_cast(a)
}
}
/// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x

View file

@ -745,7 +745,6 @@ mod tests {
#![allow(overflowing_literals)]
use core::hint::black_box;
use core::intrinsics::size_of;
use stdarch_test::simd_test;
use crate::core_arch::x86::*;
@ -881,26 +880,20 @@ mod tests {
}
#[target_feature(enable = "sse2")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i {
let byte_offset = word_index * 16 / size_of::<T>();
let pointer = data.as_ptr().add(byte_offset) as *const __m128i;
let pointer = data.as_ptr().byte_add(word_index * 16) as *const __m128i;
_mm_loadu_si128(black_box(pointer))
}
#[target_feature(enable = "avx")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i {
let byte_offset = word_index * 32 / size_of::<T>();
let pointer = data.as_ptr().add(byte_offset) as *const __m256i;
let pointer = data.as_ptr().byte_add(word_index * 32) as *const __m256i;
_mm256_loadu_si256(black_box(pointer))
}
#[target_feature(enable = "avx512f")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i {
let byte_offset = word_index * 64 / size_of::<T>();
let pointer = data.as_ptr().add(byte_offset) as *const _;
let pointer = data.as_ptr().byte_add(word_index * 64) as *const __m512i;
_mm512_loadu_si512(black_box(pointer))
}

View file

@ -26,8 +26,8 @@ use stdarch_test::assert_instr;
#[target_feature(enable = "rdrand")]
#[cfg_attr(test, assert_instr(rdrand))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 {
let (v, flag) = x86_rdrand16_step();
pub fn _rdrand16_step(val: &mut u16) -> i32 {
let (v, flag) = unsafe { x86_rdrand16_step() };
*val = v;
flag
}
@ -40,8 +40,8 @@ pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 {
#[target_feature(enable = "rdrand")]
#[cfg_attr(test, assert_instr(rdrand))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 {
let (v, flag) = x86_rdrand32_step();
pub fn _rdrand32_step(val: &mut u32) -> i32 {
let (v, flag) = unsafe { x86_rdrand32_step() };
*val = v;
flag
}
@ -54,8 +54,8 @@ pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 {
#[target_feature(enable = "rdseed")]
#[cfg_attr(test, assert_instr(rdseed))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _rdseed16_step(val: &mut u16) -> i32 {
let (v, flag) = x86_rdseed16_step();
pub fn _rdseed16_step(val: &mut u16) -> i32 {
let (v, flag) = unsafe { x86_rdseed16_step() };
*val = v;
flag
}
@ -68,8 +68,8 @@ pub unsafe fn _rdseed16_step(val: &mut u16) -> i32 {
#[target_feature(enable = "rdseed")]
#[cfg_attr(test, assert_instr(rdseed))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _rdseed32_step(val: &mut u32) -> i32 {
let (v, flag) = x86_rdseed32_step();
pub fn _rdseed32_step(val: &mut u32) -> i32 {
let (v, flag) = unsafe { x86_rdseed32_step() };
*val = v;
flag
}

View file

@ -882,7 +882,7 @@ pub fn _mm_cvtss_f32(a: __m128) -> f32 {
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
unsafe { cvtsi2ss(a, b) }
unsafe { simd_insert!(a, 0, b as f32) }
}
/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
@ -1445,8 +1445,8 @@ pub fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sfence() {
sfence()
pub fn _mm_sfence() {
unsafe { sfence() }
}
/// Gets the unsigned 32-bit value of the MXCSR control and status register.
@ -1887,6 +1887,8 @@ pub const _MM_HINT_ET1: i32 = 6;
/// * Prefetching may also fail if there are not enough memory-subsystem
/// resources (e.g., request buffers).
///
/// Note: this intrinsic is safe to use even though it takes a raw pointer argument. Prefetching
/// cannot change the observable behavior of the program; in particular, it does not trap on
/// invalid pointers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch)
#[inline]
@ -1897,11 +1899,13 @@ pub const _MM_HINT_ET1: i32 = 6;
#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
pub fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
static_assert_uimm_bits!(STRATEGY, 3);
// We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
// `locality` and `rw` are based on our `STRATEGY`.
prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
unsafe {
prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
}
}
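Editor's note: how a STRATEGY value decodes into the llvm.prefetch arguments — bit 2 selects read (0) vs. write (1) intent, and the low two bits give the locality:

const fn decode_strategy(strategy: i32) -> (i32, i32) {
    ((strategy >> 2) & 1, strategy & 3) // (rw, locality)
}

For example, _MM_HINT_T0 = 3 decodes to (0, 3), _MM_HINT_NTA = 0 to (0, 0), and _MM_HINT_ET0 = 7 to (1, 3).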
/// Returns vector of type __m128 with indeterminate elements.
@ -1989,8 +1993,6 @@ unsafe extern "C" {
fn cvtss2si(a: __m128) -> i32;
#[link_name = "llvm.x86.sse.cvttss2si"]
fn cvttss2si(a: __m128) -> i32;
#[link_name = "llvm.x86.sse.cvtsi2ss"]
fn cvtsi2ss(a: __m128, b: i32) -> __m128;
#[link_name = "llvm.x86.sse.sfence"]
fn sfence();
#[link_name = "llvm.x86.sse.stmxcsr"]
@ -2024,6 +2026,7 @@ unsafe extern "C" {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntps", ",{a}"),
p = in(reg) mem_addr,
@ -3331,6 +3334,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 4] };
_mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
_mm_sfence();
for i in 0..4 {
assert_eq!(mem.data[i], get_m128(a, i));
}

View file

@ -19,10 +19,10 @@ use crate::{
#[inline]
#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_pause() {
pub fn _mm_pause() {
// note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
// the SSE2 target-feature - therefore it does not require any target features
pause()
unsafe { pause() }
}
/// Invalidates and flushes the cache line that contains `p` from all levels of
@ -49,8 +49,8 @@ pub unsafe fn _mm_clflush(p: *const u8) {
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(lfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_lfence() {
lfence()
pub fn _mm_lfence() {
unsafe { lfence() }
}
/// Performs a serializing operation on all load-from-memory and store-to-memory
@ -65,8 +65,8 @@ pub unsafe fn _mm_lfence() {
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mfence() {
mfence()
pub fn _mm_mfence() {
unsafe { mfence() }
}
/// Adds packed 8-bit integers in `a` and `b`.
@ -201,7 +201,12 @@ pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
unsafe {
let r: i32x8 = simd_mul(simd_cast(a.as_i16x8()), simd_cast(b.as_i16x8()));
let even: i32x4 = simd_shuffle!(r, r, [0, 2, 4, 6]);
let odd: i32x4 = simd_shuffle!(r, r, [1, 3, 5, 7]);
simd_add(even, odd).as_m128i()
}
}
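Editor's note: a scalar reference for pmaddwd as implemented above — widen each i16 product to i32, then sum adjacent pairs (the only case that wraps is two maximal negative products):

fn madd_epi16_ref(a: [i16; 8], b: [i16; 8]) -> [i32; 4] {
    core::array::from_fn(|i| {
        (a[2 * i] as i32 * b[2 * i] as i32)
            .wrapping_add(a[2 * i + 1] as i32 * b[2 * i + 1] as i32)
    })
}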
/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
@ -1358,6 +1363,7 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
#[cfg_attr(test, assert_instr(movntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntdq", ",{a}"),
p = in(reg) mem_addr,
@ -1385,6 +1391,7 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movnti", ",{a:e}"), // `:e` for 32bit value
p = in(reg) mem_addr,
@ -2417,7 +2424,10 @@ pub fn _mm_cvtsd_f64(a: __m128d) -> f64 {
#[cfg_attr(test, assert_instr(cvtss2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
unsafe { cvtss2sd(a, b) }
unsafe {
let elt: f32 = simd_extract!(b, 0);
simd_insert!(a, 0, elt as f64)
}
}
/// Converts packed double-precision (64-bit) floating-point elements in `a` to
@ -2619,6 +2629,7 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntpd", ",{a}"),
p = in(reg) mem_addr,
@ -3043,8 +3054,6 @@ unsafe extern "C" {
fn lfence();
#[link_name = "llvm.x86.sse2.mfence"]
fn mfence();
#[link_name = "llvm.x86.sse2.pmadd.wd"]
fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
#[link_name = "llvm.x86.sse2.psad.bw"]
fn psadbw(a: u8x16, b: u8x16) -> u64x2;
#[link_name = "llvm.x86.sse2.psll.w"]
@ -3115,8 +3124,6 @@ unsafe extern "C" {
fn cvtsd2si(a: __m128d) -> i32;
#[link_name = "llvm.x86.sse2.cvtsd2ss"]
fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
#[link_name = "llvm.x86.sse2.cvtss2sd"]
fn cvtss2sd(a: __m128d, b: __m128) -> __m128d;
#[link_name = "llvm.x86.sse2.cvttpd2dq"]
fn cvttpd2dq(a: __m128d) -> i32x4;
#[link_name = "llvm.x86.sse2.cvttsd2si"]
@ -3142,7 +3149,7 @@ mod tests {
#[test]
fn test_mm_pause() {
unsafe { _mm_pause() }
_mm_pause()
}
#[simd_test(enable = "sse2")]
@ -4066,6 +4073,7 @@ mod tests {
);
let mut r = _mm_set1_epi8(0);
_mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
_mm_sfence();
let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
assert_eq_m128i(r, e);
}
@ -4102,6 +4110,7 @@ mod tests {
let a = _mm_setr_epi32(1, 2, 3, 4);
let mut r = _mm_undefined_si128();
_mm_stream_si128(ptr::addr_of_mut!(r), a);
_mm_sfence();
assert_eq_m128i(r, a);
}
@ -4113,6 +4122,7 @@ mod tests {
let a: i32 = 7;
let mut mem = boxed::Box::<i32>::new(-1);
_mm_stream_si32(ptr::addr_of_mut!(*mem), a);
_mm_sfence();
assert_eq!(a, *mem);
}
@ -4809,6 +4819,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 2] };
_mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
_mm_sfence();
for i in 0..2 {
assert_eq!(mem.data[i], get_m128d(a, i));
}

View file

@ -51,7 +51,11 @@ pub fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(haddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d {
unsafe { haddpd(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 2]);
let odd = simd_shuffle!(a, b, [1, 3]);
simd_add(even, odd)
}
}
/// Horizontally adds adjacent pairs of single-precision (32-bit)
@ -63,7 +67,11 @@ pub fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(haddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 {
unsafe { haddps(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 2, 4, 6]);
let odd = simd_shuffle!(a, b, [1, 3, 5, 7]);
simd_add(even, odd)
}
}
/// Horizontally subtract adjacent pairs of double-precision (64-bit)
@ -75,7 +83,11 @@ pub fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(hsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d {
unsafe { hsubpd(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 2]);
let odd = simd_shuffle!(a, b, [1, 3]);
simd_sub(even, odd)
}
}
/// Horizontally adds adjacent pairs of single-precision (32-bit)
@ -87,7 +99,11 @@ pub fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(hsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 {
unsafe { hsubps(a, b) }
unsafe {
let even = simd_shuffle!(a, b, [0, 2, 4, 6]);
let odd = simd_shuffle!(a, b, [1, 3, 5, 7]);
simd_sub(even, odd)
}
}
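Editor's note: what the even/odd shuffles in this family compute — adjacent pairs within each source, with `a` filling the low half of the result and `b` the high half:

fn hadd_ps_ref(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]]
}

fn hsub_ps_ref(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [a[0] - a[1], a[2] - a[3], b[0] - b[1], b[2] - b[3]]
}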
/// Loads 128-bits of integer data from unaligned memory.
@ -153,14 +169,6 @@ pub fn _mm_moveldup_ps(a: __m128) -> __m128 {
#[allow(improper_ctypes)]
unsafe extern "C" {
#[link_name = "llvm.x86.sse3.hadd.pd"]
fn haddpd(a: __m128d, b: __m128d) -> __m128d;
#[link_name = "llvm.x86.sse3.hadd.ps"]
fn haddps(a: __m128, b: __m128) -> __m128;
#[link_name = "llvm.x86.sse3.hsub.pd"]
fn hsubpd(a: __m128d, b: __m128d) -> __m128d;
#[link_name = "llvm.x86.sse3.hsub.ps"]
fn hsubps(a: __m128, b: __m128) -> __m128;
#[link_name = "llvm.x86.sse3.ldu.dq"]
fn lddqu(mem_addr: *const i8) -> i8x16;
}

View file

@ -1006,7 +1006,10 @@ pub fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
unsafe { ptestz(a.as_i64x2(), mask.as_i64x2()) }
unsafe {
let r = simd_reduce_or(simd_and(a.as_i64x2(), mask.as_i64x2()));
(0i64 == r) as i32
}
}
/// Tests whether the specified bits in a 128-bit integer vector are all
@ -1029,7 +1032,13 @@ pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
unsafe { ptestc(a.as_i64x2(), mask.as_i64x2()) }
unsafe {
let r = simd_reduce_or(simd_and(
simd_xor(a.as_i64x2(), i64x2::splat(!0)),
mask.as_i64x2(),
));
(0i64 == r) as i32
}
}
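Editor's note: ptest flag semantics in scalar form, with u128 standing in for __m128i — testz reports ZF = ((a AND mask) == 0), and testc reports CF = ((NOT a AND mask) == 0):

fn testz_ref(a: u128, mask: u128) -> i32 {
    ((a & mask) == 0) as i32
}

fn testc_ref(a: u128, mask: u128) -> i32 {
    ((!a & mask) == 0) as i32
}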
/// Tests whether the specified bits in a 128-bit integer vector are
@ -1165,10 +1174,6 @@ unsafe extern "C" {
fn phminposuw(a: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse41.mpsadbw"]
fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
#[link_name = "llvm.x86.sse41.ptestz"]
fn ptestz(a: i64x2, mask: i64x2) -> i32;
#[link_name = "llvm.x86.sse41.ptestc"]
fn ptestc(a: i64x2, mask: i64x2) -> i32;
#[link_name = "llvm.x86.sse41.ptestnzc"]
fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}

View file

@ -15,10 +15,6 @@ unsafe extern "C" {
fn insertq(x: i64x2, y: i64x2) -> i64x2;
#[link_name = "llvm.x86.sse4a.insertqi"]
fn insertqi(x: i64x2, y: i64x2, len: u8, idx: u8) -> i64x2;
#[link_name = "llvm.x86.sse4a.movnt.sd"]
fn movntsd(x: *mut f64, y: __m128d);
#[link_name = "llvm.x86.sse4a.movnt.ss"]
fn movntss(x: *mut f32, y: __m128);
}
/// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
@ -114,7 +110,13 @@ pub fn _mm_inserti_si64<const LEN: i32, const IDX: i32>(x: __m128i, y: __m128i)
#[cfg_attr(test, assert_instr(movntsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
movntsd(p, a);
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntsd", ",{a}"),
p = in(reg) p,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
}
/// Non-temporal store of `a.0` into `p`.
@ -134,7 +136,13 @@ pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
#[cfg_attr(test, assert_instr(movntss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) {
movntss(p, a);
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntss", ",{a}"),
p = in(reg) p,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
}
#[cfg(test)]
@ -209,6 +217,7 @@ mod tests {
let x = _mm_setr_pd(3.0, 4.0);
_mm_stream_sd(d, x);
_mm_sfence();
}
assert_eq!(mem.data[0], 3.0);
assert_eq!(mem.data[1], 2.0);
@ -234,6 +243,7 @@ mod tests {
let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
_mm_stream_ss(d, x);
_mm_sfence();
}
assert_eq!(mem.data[0], 5.0);
assert_eq!(mem.data[1], 2.0);

View file

@ -164,7 +164,13 @@ pub fn _mm_alignr_epi8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(phaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(phaddw128(a.as_i16x8(), b.as_i16x8())) }
let a = a.as_i16x8();
let b = b.as_i16x8();
unsafe {
let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
simd_add(even, odd).as_m128i()
}
}
/// Horizontally adds the adjacent pairs of values contained in 2 packed
@ -189,7 +195,13 @@ pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(phaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(phaddd128(a.as_i32x4(), b.as_i32x4())) }
let a = a.as_i32x4();
let b = b.as_i32x4();
unsafe {
let even: i32x4 = simd_shuffle!(a, b, [0, 2, 4, 6]);
let odd: i32x4 = simd_shuffle!(a, b, [1, 3, 5, 7]);
simd_add(even, odd).as_m128i()
}
}
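Editor's note: the integer horizontal adds use the same even/odd pattern, wrapping on overflow as the hardware does:

fn hadd_epi32_ref(a: [i32; 4], b: [i32; 4]) -> [i32; 4] {
    [
        a[0].wrapping_add(a[1]),
        a[2].wrapping_add(a[3]),
        b[0].wrapping_add(b[1]),
        b[2].wrapping_add(b[3]),
    ]
}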
/// Horizontally subtract the adjacent pairs of values contained in 2
@ -201,7 +213,13 @@ pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(phsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(phsubw128(a.as_i16x8(), b.as_i16x8())) }
let a = a.as_i16x8();
let b = b.as_i16x8();
unsafe {
let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
simd_sub(even, odd).as_m128i()
}
}
/// Horizontally subtract the adjacent pairs of values contained in 2
@ -227,7 +245,13 @@ pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(phsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i {
unsafe { transmute(phsubd128(a.as_i32x4(), b.as_i32x4())) }
let a = a.as_i32x4();
let b = b.as_i32x4();
unsafe {
let even: i32x4 = simd_shuffle!(a, b, [0, 2, 4, 6]);
let odd: i32x4 = simd_shuffle!(a, b, [1, 3, 5, 7]);
simd_sub(even, odd).as_m128i()
}
}
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
@ -305,24 +329,12 @@ unsafe extern "C" {
#[link_name = "llvm.x86.ssse3.pshuf.b.128"]
fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
#[link_name = "llvm.x86.ssse3.phadd.w.128"]
fn phaddw128(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.ssse3.phadd.sw.128"]
fn phaddsw128(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.ssse3.phadd.d.128"]
fn phaddd128(a: i32x4, b: i32x4) -> i32x4;
#[link_name = "llvm.x86.ssse3.phsub.w.128"]
fn phsubw128(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.ssse3.phsub.sw.128"]
fn phsubsw128(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.ssse3.phsub.d.128"]
fn phsubd128(a: i32x4, b: i32x4) -> i32x4;
#[link_name = "llvm.x86.ssse3.pmadd.ub.sw.128"]
fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8;

View file

@ -30,7 +30,7 @@ unsafe extern "C" {
#[cfg_attr(test, assert_instr(bextr, CONTROL = 0x0404))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _bextri_u32<const CONTROL: u32>(a: u32) -> u32 {
pub fn _bextri_u32<const CONTROL: u32>(a: u32) -> u32 {
static_assert_uimm_bits!(CONTROL, 16);
unsafe { bextri_u32(a, CONTROL) }
}
@ -42,7 +42,7 @@ pub unsafe fn _bextri_u32<const CONTROL: u32>(a: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcfill))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcfill_u32(x: u32) -> u32 {
pub fn _blcfill_u32(x: u32) -> u32 {
x & (x.wrapping_add(1))
}
@ -53,7 +53,7 @@ pub unsafe fn _blcfill_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blci))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blci_u32(x: u32) -> u32 {
pub fn _blci_u32(x: u32) -> u32 {
x | !x.wrapping_add(1)
}
@ -64,7 +64,7 @@ pub unsafe fn _blci_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcic))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcic_u32(x: u32) -> u32 {
pub fn _blcic_u32(x: u32) -> u32 {
!x & x.wrapping_add(1)
}
@ -76,7 +76,7 @@ pub unsafe fn _blcic_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcmsk))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
pub fn _blcmsk_u32(x: u32) -> u32 {
x ^ x.wrapping_add(1)
}
@ -87,7 +87,7 @@ pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcs))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcs_u32(x: u32) -> u32 {
pub fn _blcs_u32(x: u32) -> u32 {
x | x.wrapping_add(1)
}
@ -98,7 +98,7 @@ pub unsafe fn _blcs_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blsfill))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blsfill_u32(x: u32) -> u32 {
pub fn _blsfill_u32(x: u32) -> u32 {
x | x.wrapping_sub(1)
}
@ -109,7 +109,7 @@ pub unsafe fn _blsfill_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blsic))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blsic_u32(x: u32) -> u32 {
pub fn _blsic_u32(x: u32) -> u32 {
!x | x.wrapping_sub(1)
}
@ -121,7 +121,7 @@ pub unsafe fn _blsic_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(t1mskc))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _t1mskc_u32(x: u32) -> u32 {
pub fn _t1mskc_u32(x: u32) -> u32 {
!x | x.wrapping_add(1)
}
@ -133,7 +133,7 @@ pub unsafe fn _t1mskc_u32(x: u32) -> u32 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(tzmsk))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _tzmsk_u32(x: u32) -> u32 {
pub fn _tzmsk_u32(x: u32) -> u32 {
!x & x.wrapping_sub(1)
}
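Editor's note: the TBM identities above, checked on one sample value with plain integer ops:

fn main() {
    let x: u32 = 0b0110_0111;
    assert_eq!(x & x.wrapping_add(1), 0b0110_0000); // _blcfill: clear the trailing 1s
    assert_eq!(x ^ x.wrapping_add(1), 0b0000_1111); // _blcmsk: mask through the lowest 0
    assert_eq!(!x & x.wrapping_sub(1), 0); // _tzmsk: trailing-zero mask (none here)
}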

View file

@ -159,29 +159,39 @@ pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) {
xrstors(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
}
#[cfg(test)]
pub(crate) use tests::XsaveArea;
#[cfg(test)]
mod tests {
use std::{fmt, prelude::v1::*};
use std::boxed::Box;
use crate::core_arch::x86::*;
use stdarch_test::simd_test;
#[repr(align(64))]
#[derive(Debug)]
struct XsaveArea {
// max size for 256-bit registers is 800 bytes:
// see https://software.intel.com/en-us/node/682996
// max size for 512-bit registers is 2560 bytes:
// FIXME: add source
data: [u8; 2560],
pub(crate) struct XsaveArea {
data: Box<[AlignedArray]>,
}
#[repr(align(64))]
#[derive(Copy, Clone, Debug)]
struct AlignedArray([u8; 64]);
impl XsaveArea {
fn new() -> XsaveArea {
XsaveArea { data: [0; 2560] }
#[target_feature(enable = "xsave")]
pub(crate) fn new() -> XsaveArea {
// `CPUID.(EAX=0DH,ECX=0):ECX` contains the size required to hold all supported xsave
// components. `EBX` contains the size required to hold all xsave components currently
// enabled in `XCR0`. We are using `ECX` to ensure enough space in all scenarios.
let CpuidResult { ecx, .. } = unsafe { __cpuid(0x0d) };
XsaveArea {
data: vec![AlignedArray([0; 64]); ecx.div_ceil(64) as usize].into_boxed_slice(),
}
}
fn ptr(&mut self) -> *mut u8 {
self.data.as_mut_ptr()
pub(crate) fn ptr(&mut self) -> *mut u8 {
self.data.as_mut_ptr().cast()
}
}
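Editor's note: a sketch of the sizing query `new` performs — CPUID leaf 0x0d, subleaf 0, where ECX covers all supported components (assumes the CPU exposes that leaf):

fn xsave_area_size_bytes() -> usize {
    // SAFETY: assumes CPUID leaf 0x0d is supported; real code should gate
    // this on the xsave feature bit.
    let r = unsafe { core::arch::x86_64::__cpuid_count(0x0d, 0) };
    r.ecx as usize
}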

View file

@ -5,8 +5,6 @@ use stdarch_test::assert_instr;
unsafe extern "unadjusted" {
#[link_name = "llvm.x86.addcarry.64"]
fn llvm_addcarry_u64(a: u8, b: u64, c: u64) -> (u8, u64);
#[link_name = "llvm.x86.addcarryx.u64"]
fn llvm_addcarryx_u64(a: u8, b: u64, c: u64, d: *mut u64) -> u8;
#[link_name = "llvm.x86.subborrow.64"]
fn llvm_subborrow_u64(a: u8, b: u64, c: u64) -> (u8, u64);
}
@ -19,8 +17,8 @@ unsafe extern "unadjusted" {
#[inline]
#[cfg_attr(test, assert_instr(adc))]
#[stable(feature = "simd_x86_adx", since = "1.33.0")]
pub unsafe fn _addcarry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
let (a, b) = llvm_addcarry_u64(c_in, a, b);
pub fn _addcarry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
let (a, b) = unsafe { llvm_addcarry_u64(c_in, a, b) };
*out = b;
a
}
@ -34,8 +32,8 @@ pub unsafe fn _addcarry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
#[target_feature(enable = "adx")]
#[cfg_attr(test, assert_instr(adc))]
#[stable(feature = "simd_x86_adx", since = "1.33.0")]
pub unsafe fn _addcarryx_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
llvm_addcarryx_u64(c_in, a, b, out as *mut _)
pub fn _addcarryx_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
_addcarry_u64(c_in, a, b, out)
}
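Editor's note: the typical use of these intrinsics is chaining the carry through a multi-limb addition; a two-limb (128-bit) sketch, assuming the safe signatures introduced by this diff:

#[cfg(target_arch = "x86_64")]
fn add_2x64(a: [u64; 2], b: [u64; 2]) -> ([u64; 2], u8) {
    use core::arch::x86_64::_addcarry_u64;
    let (mut lo, mut hi) = (0, 0);
    let c = _addcarry_u64(0, a[0], b[0], &mut lo); // low limb, carry out
    let c = _addcarry_u64(c, a[1], b[1], &mut hi); // high limb, carry in
    ([lo, hi], c)
}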
/// Adds unsigned 64-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in`.
@ -46,8 +44,8 @@ pub unsafe fn _addcarryx_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
#[inline]
#[cfg_attr(test, assert_instr(sbb))]
#[stable(feature = "simd_x86_adx", since = "1.33.0")]
pub unsafe fn _subborrow_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
let (a, b) = llvm_subborrow_u64(c_in, a, b);
pub fn _subborrow_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
let (a, b) = unsafe { llvm_subborrow_u64(c_in, a, b) };
*out = b;
a
}
@ -60,38 +58,6 @@ mod tests {
#[test]
fn test_addcarry_u64() {
unsafe {
let a = u64::MAX;
let mut out = 0;
let r = _addcarry_u64(0, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarry_u64(0, a, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, a);
let r = _addcarry_u64(1, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 1);
let r = _addcarry_u64(1, a, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarry_u64(0, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 7);
let r = _addcarry_u64(1, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 8);
}
}
#[simd_test(enable = "adx")]
unsafe fn test_addcarryx_u64() {
let a = u64::MAX;
let mut out = 0;
@ -120,35 +86,63 @@ mod tests {
assert_eq!(out, 8);
}
#[simd_test(enable = "adx")]
fn test_addcarryx_u64() {
let a = u64::MAX;
let mut out = 0;
let r = _addcarryx_u64(0, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarryx_u64(0, a, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, a);
let r = _addcarryx_u64(1, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 1);
let r = _addcarryx_u64(1, a, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
let r = _addcarryx_u64(0, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 7);
let r = _addcarryx_u64(1, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 8);
}
#[test]
fn test_subborrow_u64() {
unsafe {
let a = u64::MAX;
let mut out = 0;
let a = u64::MAX;
let mut out = 0;
let r = _subborrow_u64(0, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u64(0, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u64(0, 0, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 0);
let r = _subborrow_u64(0, 0, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 0);
let r = _subborrow_u64(1, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a - 1);
let r = _subborrow_u64(1, 0, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a - 1);
let r = _subborrow_u64(1, 0, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u64(1, 0, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, a);
let r = _subborrow_u64(0, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 4);
let r = _subborrow_u64(0, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 4);
let r = _subborrow_u64(1, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 3);
}
let r = _subborrow_u64(1, 7, 3, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 3);
}
}

View file

@ -11,7 +11,7 @@ use stdarch_test::assert_instr;
#[inline]
#[cfg_attr(test, assert_instr(bswap))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _bswap64(x: i64) -> i64 {
pub fn _bswap64(x: i64) -> i64 {
x.swap_bytes()
}
@ -21,9 +21,7 @@ mod tests {
#[test]
fn test_bswap64() {
unsafe {
assert_eq!(_bswap64(0x0EADBEEFFADECA0E), 0x0ECADEFAEFBEAD0E);
assert_eq!(_bswap64(0x0000000000000000), 0x0000000000000000);
}
assert_eq!(_bswap64(0x0EADBEEFFADECA0E), 0x0ECADEFAEFBEAD0E);
assert_eq!(_bswap64(0x0000000000000000), 0x0000000000000000);
}
}

View file

@ -23,8 +23,8 @@ use stdarch_test::assert_instr;
#[target_feature(enable = "rdrand")]
#[cfg_attr(test, assert_instr(rdrand))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _rdrand64_step(val: &mut u64) -> i32 {
let (v, flag) = x86_rdrand64_step();
pub fn _rdrand64_step(val: &mut u64) -> i32 {
let (v, flag) = unsafe { x86_rdrand64_step() };
*val = v;
flag
}
@ -37,8 +37,8 @@ pub unsafe fn _rdrand64_step(val: &mut u64) -> i32 {
#[target_feature(enable = "rdseed")]
#[cfg_attr(test, assert_instr(rdseed))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _rdseed64_step(val: &mut u64) -> i32 {
let (v, flag) = x86_rdseed64_step();
pub fn _rdseed64_step(val: &mut u64) -> i32 {
let (v, flag) = unsafe { x86_rdseed64_step() };
*val = v;
flag
}
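Editor's note: rdrand/rdseed report transient failure through the flag (0 = no data available), so callers retry; a sketch, assuming the rdrand target feature is available at the call site:

fn try_rand_u64() -> Option<u64> {
    let mut v = 0u64;
    for _ in 0..10 {
        // SAFETY: the caller must ensure the CPU supports rdrand.
        if unsafe { core::arch::x86_64::_rdrand64_step(&mut v) } == 1 {
            return Some(v);
        }
    }
    None
}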

View file

@ -11,8 +11,6 @@ unsafe extern "C" {
fn cvtss2si64(a: __m128) -> i64;
#[link_name = "llvm.x86.sse.cvttss2si64"]
fn cvttss2si64(a: __m128) -> i64;
#[link_name = "llvm.x86.sse.cvtsi642ss"]
fn cvtsi642ss(a: __m128, b: i64) -> __m128;
}
/// Converts the lowest 32 bit float in the input vector to a 64 bit integer.
@ -65,7 +63,7 @@ pub fn _mm_cvttss_si64(a: __m128) -> i64 {
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi64_ss(a: __m128, b: i64) -> __m128 {
unsafe { cvtsi642ss(a, b) }
unsafe { simd_insert!(a, 0, b as f32) }
}
#[cfg(test)]

View file

@ -78,6 +78,7 @@ pub fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movnti", ",{a}"),
p = in(reg) mem_addr,
@ -200,6 +201,7 @@ mod tests {
let a: i64 = 7;
let mut mem = boxed::Box::<i64>::new(-1);
_mm_stream_si64(ptr::addr_of_mut!(*mem), a);
_mm_sfence();
assert_eq!(a, *mem);
}

View file

@ -30,7 +30,7 @@ unsafe extern "C" {
#[cfg_attr(test, assert_instr(bextr, CONTROL = 0x0404))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _bextri_u64<const CONTROL: u64>(a: u64) -> u64 {
pub fn _bextri_u64<const CONTROL: u64>(a: u64) -> u64 {
static_assert_uimm_bits!(CONTROL, 16);
unsafe { bextri_u64(a, CONTROL) }
}
@ -42,7 +42,7 @@ pub unsafe fn _bextri_u64<const CONTROL: u64>(a: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcfill))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcfill_u64(x: u64) -> u64 {
pub fn _blcfill_u64(x: u64) -> u64 {
x & x.wrapping_add(1)
}
@ -53,7 +53,7 @@ pub unsafe fn _blcfill_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blci))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blci_u64(x: u64) -> u64 {
pub fn _blci_u64(x: u64) -> u64 {
x | !x.wrapping_add(1)
}
@ -64,7 +64,7 @@ pub unsafe fn _blci_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcic))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcic_u64(x: u64) -> u64 {
pub fn _blcic_u64(x: u64) -> u64 {
!x & x.wrapping_add(1)
}
@ -76,7 +76,7 @@ pub unsafe fn _blcic_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcmsk))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcmsk_u64(x: u64) -> u64 {
pub fn _blcmsk_u64(x: u64) -> u64 {
x ^ x.wrapping_add(1)
}
@ -87,7 +87,7 @@ pub unsafe fn _blcmsk_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blcs))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blcs_u64(x: u64) -> u64 {
pub fn _blcs_u64(x: u64) -> u64 {
x | x.wrapping_add(1)
}
@ -98,7 +98,7 @@ pub unsafe fn _blcs_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blsfill))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blsfill_u64(x: u64) -> u64 {
pub fn _blsfill_u64(x: u64) -> u64 {
x | x.wrapping_sub(1)
}
@ -109,7 +109,7 @@ pub unsafe fn _blsfill_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(blsic))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _blsic_u64(x: u64) -> u64 {
pub fn _blsic_u64(x: u64) -> u64 {
!x | x.wrapping_sub(1)
}
@ -121,7 +121,7 @@ pub unsafe fn _blsic_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(t1mskc))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _t1mskc_u64(x: u64) -> u64 {
pub fn _t1mskc_u64(x: u64) -> u64 {
!x | x.wrapping_add(1)
}
@ -133,7 +133,7 @@ pub unsafe fn _t1mskc_u64(x: u64) -> u64 {
#[target_feature(enable = "tbm")]
#[cfg_attr(test, assert_instr(tzmsk))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _tzmsk_u64(x: u64) -> u64 {
pub fn _tzmsk_u64(x: u64) -> u64 {
!x & x.wrapping_sub(1)
}

View file

@ -126,29 +126,10 @@ pub unsafe fn _xrstors64(mem_addr: *const u8, rs_mask: u64) {
#[cfg(test)]
mod tests {
use crate::core_arch::x86_64::xsave;
use std::fmt;
use crate::core_arch::x86::*;
use crate::core_arch::x86_64::*;
use stdarch_test::simd_test;
#[repr(align(64))]
#[derive(Debug)]
struct XsaveArea {
// max size for 256-bit registers is 800 bytes:
// see https://software.intel.com/en-us/node/682996
// max size for 512-bit registers is 2560 bytes:
// FIXME: add source
data: [u8; 2560],
}
impl XsaveArea {
fn new() -> XsaveArea {
XsaveArea { data: [0; 2560] }
}
fn ptr(&mut self) -> *mut u8 {
self.data.as_mut_ptr()
}
}
#[simd_test(enable = "xsave")]
#[cfg_attr(miri, ignore)] // Register saving/restoring is not supported in Miri
unsafe fn test_xsave64() {
@ -156,9 +137,9 @@ mod tests {
let mut a = XsaveArea::new();
let mut b = XsaveArea::new();
xsave::_xsave64(a.ptr(), m);
xsave::_xrstor64(a.ptr(), m);
xsave::_xsave64(b.ptr(), m);
_xsave64(a.ptr(), m);
_xrstor64(a.ptr(), m);
_xsave64(b.ptr(), m);
}
#[simd_test(enable = "xsave,xsaveopt")]
@ -168,9 +149,9 @@ mod tests {
let mut a = XsaveArea::new();
let mut b = XsaveArea::new();
xsave::_xsaveopt64(a.ptr(), m);
xsave::_xrstor64(a.ptr(), m);
xsave::_xsaveopt64(b.ptr(), m);
_xsaveopt64(a.ptr(), m);
_xrstor64(a.ptr(), m);
_xsaveopt64(b.ptr(), m);
}
#[simd_test(enable = "xsave,xsavec")]
@ -180,8 +161,8 @@ mod tests {
let mut a = XsaveArea::new();
let mut b = XsaveArea::new();
xsave::_xsavec64(a.ptr(), m);
xsave::_xrstor64(a.ptr(), m);
xsave::_xsavec64(b.ptr(), m);
_xsavec64(a.ptr(), m);
_xrstor64(a.ptr(), m);
_xsavec64(b.ptr(), m);
}
}

View file

@ -19,3 +19,6 @@ pretty_env_logger = "0.5.0"
rayon = "1.5.0"
diff = "0.1.12"
itertools = "0.14.0"
quick-xml = { version = "0.37.5", features = ["serialize", "overlapped-lists"] }
serde-xml-rs = "0.8.0"
regex = "1.11.1"

View file

@ -0,0 +1,904 @@
# Defined under a similar name
#__bswap_64
_bswap64
# Provides a pointer to allocated memory, which is difficult to test
_mm_malloc
# requires target feature 'waitpkg', but would be inlined into function that is compiled without support for 'waitpkg'
_tpause
_umwait
# `use of undeclared identifier` error in Clang
_bit_scan_forward
_bit_scan_reverse
_bswap
_castf32_u32
_castf64_u64
_castu32_f32
_castu64_f64
_lrotl
_lrotr
_may_i_use_cpu_feature
_may_i_use_cpu_feature_ext
_mm256_acos_pd
_mm256_acos_ph
_mm256_acos_ps
_mm256_acosh_pd
_mm256_acosh_ph
_mm256_acosh_ps
_mm256_asin_pd
_mm256_asin_ph
_mm256_asin_ps
_mm256_asinh_pd
_mm256_asinh_ph
_mm256_asinh_ps
_mm256_atan_pd
_mm256_atan_ps
_mm256_atan_ph
_mm256_atan2_pd
_mm256_atan2_ph
_mm256_atan2_ps
_mm256_atanh_pd
_mm256_atanh_ph
_mm256_atanh_ps
_mm256_cbrt_pd
_mm256_cbrt_ph
_mm256_cbrt_ps
_mm256_cdfnorm_pd
_mm256_cdfnorm_ph
_mm256_cdfnorm_ps
_mm256_cdfnorminv_pd
_mm256_cdfnorminv_ph
_mm256_cdfnorminv_ps
_mm256_cexp_ps
_mm256_cos_pd
_mm256_cos_ph
_mm256_cos_ps
_mm256_cosd_pd
_mm256_cosd_ph
_mm256_cosd_ps
_mm256_cosh_pd
_mm256_cosh_ph
_mm256_cosh_ps
_mm256_csqrt_ps
_mm256_div_epi16
_mm256_div_epi32
_mm256_div_epi64
_mm256_div_epi8
_mm256_div_epu16
_mm256_div_epu32
_mm256_div_epu64
_mm256_div_epu8
_mm256_dpbssd_epi32
_mm256_dpbssds_epi32
_mm256_dpbsud_epi32
_mm256_dpbsuds_epi32
_mm256_dpbuud_epi32
_mm256_dpbuuds_epi32
_mm256_dpwsud_epi32
_mm256_dpwsuds_epi32
_mm256_dpwusd_epi32
_mm256_dpwusds_epi32
_mm256_dpwuud_epi32
_mm256_dpwuuds_epi32
_mm256_erf_pd
_mm256_erf_ps
_mm256_erfc_pd
_mm256_erfc_ph
_mm256_erfc_ps
_mm256_erfcinv_pd
_mm256_erfcinv_ph
_mm256_erfcinv_ps
_mm256_erfinv_pd
_mm256_erfinv_ph
_mm256_erfinv_ps
_mm256_exp10_pd
_mm256_exp10_ph
_mm256_exp10_ps
_mm256_exp2_pd
_mm256_exp2_ph
_mm256_exp2_ps
_mm256_exp_pd
_mm256_exp_ph
_mm256_exp_ps
_mm256_expm1_pd
_mm256_expm1_ph
_mm256_expm1_ps
_mm256_hypot_pd
_mm256_hypot_ph
_mm256_hypot_ps
_mm256_idiv_epi32
_mm256_invcbrt_pd
_mm256_invcbrt_ph
_mm256_invcbrt_ps
_mm256_invsqrt_pd
_mm256_invsqrt_ph
_mm256_invsqrt_ps
_mm256_irem_epi32
_mm256_log10_pd
_mm256_log10_ph
_mm256_log10_ps
_mm256_log1p_pd
_mm256_log1p_ph
_mm256_log1p_ps
_mm256_log2_pd
_mm256_log2_ph
_mm256_log2_ps
_mm256_log_pd
_mm256_log_ph
_mm256_log_ps
_mm256_logb_pd
_mm256_logb_ph
_mm256_logb_ps
_mm256_clog_ps
_mm256_madd52hi_avx_epu64
_mm256_madd52lo_avx_epu64
_mm256_erf_ph
_mm256_mask_reduce_add_epi16
_mm256_mask_reduce_add_epi8
_mm256_mask_reduce_and_epi16
_mm256_mask_reduce_and_epi8
_mm256_mask_reduce_max_epi16
_mm256_mask_reduce_max_epi8
_mm256_mask_reduce_max_epu16
_mm256_mask_reduce_max_epu8
_mm256_mask_reduce_min_epi16
_mm256_mask_reduce_min_epi8
_mm256_mask_reduce_min_epu16
_mm256_mask_reduce_min_epu8
_mm256_mask_reduce_mul_epi16
_mm256_mask_reduce_mul_epi8
_mm256_mask_reduce_or_epi16
_mm256_mask_reduce_or_epi8
_mm512_cosd_ph
_mm512_cosd_ps
_mm512_cosh_pd
_mm512_cosh_ph
_mm512_cosh_ps
_mm512_div_epi16
_mm512_div_epi32
_mm512_div_epi64
_mm512_div_epi8
_mm512_div_epu16
_mm512_div_epu32
_mm512_div_epu64
_mm512_div_epu8
_mm512_erf_pd
_mm512_erf_ph
_mm512_erf_ps
_mm512_erfc_pd
_mm512_erfc_ph
_mm512_erfc_ps
_mm512_erfcinv_pd
_mm512_erfcinv_ph
_mm512_erfcinv_ps
_mm512_erfinv_pd
_mm512_erfinv_ph
_mm512_erfinv_ps
_mm512_exp10_pd
_mm512_exp10_ph
_mm512_exp10_ps
_mm512_exp2_pd
_mm512_exp2_ph
_mm512_exp2_ps
_mm512_exp_pd
_mm512_exp_ph
_mm512_exp_ps
_mm512_expm1_pd
_mm512_expm1_ph
_mm512_expm1_ps
_mm512_floor_ph
_mm512_hypot_pd
_mm512_hypot_ph
_mm512_hypot_ps
_mm512_invsqrt_pd
_mm512_invsqrt_ph
_mm512_invsqrt_ps
_mm512_log10_pd
_mm512_log10_ph
_mm512_log10_ps
_mm512_log1p_pd
_mm512_log1p_ph
_mm512_log1p_ps
_mm512_log2_pd
_mm512_log2_ph
_mm512_log2_ps
_mm512_log_pd
_mm512_log_ph
_mm512_log_ps
_mm512_logb_pd
_mm512_logb_ph
_mm512_logb_ps
_mm512_mask_acos_pd
_mm512_mask_acos_ph
_mm512_mask_acos_ps
_mm512_mask_acosh_pd
_mm512_mask_acosh_ph
_mm512_mask_acosh_ps
_mm512_mask_asin_pd
_mm512_mask_asin_ph
_mm512_mask_asin_ps
_mm512_mask_asinh_pd
_mm512_mask_asinh_ph
_mm512_mask_asinh_ps
_mm512_mask_atan2_pd
_mm512_mask_atan2_ps
_mm512_mask_atan_pd
_mm512_mask_atan_ph
_mm512_mask_atanh_pd
_mm512_mask_atanh_ph
_mm512_mask_atanh_ps
_mm512_mask_cbrt_pd
_mm512_mask_cbrt_ph
_mm512_mask_cbrt_ps
_mm512_mask_cdfnorm_pd
_mm512_mask_cdfnorm_ph
_mm512_mask_cdfnorm_ps
_mm512_mask_cdfnorminv_pd
_mm512_mask_cdfnorminv_ph
_mm512_mask_cdfnorminv_ps
_mm512_mask_ceil_ph
_mm512_mask_cos_pd
_mm512_mask_cos_ph
_mm512_mask_cos_ps
_mm512_mask_cosd_pd
_mm512_mask_cosd_ph
_mm512_mask_cosd_ps
_mm512_mask_cosh_pd
_mm512_mask_cosh_ph
_mm512_mask_cosh_ps
_mm512_mask_atan_ps
_mm512_cosd_pd
_mm512_cos_ps
_mm512_cos_ph
_mm512_cos_pd
_mm512_mask_div_epi32
_mm512_mask_div_epu32
_mm512_mask_erf_pd
_mm512_mask_erf_ph
_mm512_mask_erf_ps
_mm512_mask_erfc_pd
_mm512_mask_erfc_ph
_mm512_mask_erfc_ps
_mm512_mask_erfcinv_pd
_mm512_mask_erfcinv_ph
_mm512_mask_erfcinv_ps
_mm512_mask_erfinv_pd
_mm512_mask_erfinv_ph
_mm512_mask_erfinv_ps
_mm512_mask_exp10_pd
_mm512_mask_exp10_ph
_mm512_mask_exp10_ps
_mm512_mask_exp2_pd
_mm512_mask_exp2_ph
_mm512_mask_exp2_ps
_mm512_mask_exp_pd
_mm512_mask_exp_ph
_mm512_mask_exp_ps
_mm512_mask_expm1_pd
_mm512_mask_expm1_ph
_mm512_mask_expm1_ps
_mm512_mask_floor_ph
_mm512_mask_hypot_pd
_mm512_mask_hypot_ps
_mm512_mask_invsqrt_pd
_mm512_mask_invsqrt_ph
_mm512_mask_invsqrt_ps
_mm512_mask_log10_pd
_mm512_mask_log10_ph
_mm512_mask_log10_ps
_mm512_mask_log1p_pd
_mm512_mask_log1p_ph
_mm512_mask_log1p_ps
_mm512_mask_log2_pd
_mm512_mask_log2_ph
_mm512_mask_log2_ps
_mm512_mask_log_pd
_mm512_mask_log_ph
_mm512_mask_log_ps
_mm512_mask_logb_pd
_mm512_mask_logb_ph
_mm512_mask_logb_ps
_mm512_mask_nearbyint_pd
_mm512_mask_nearbyint_ph
_mm512_mask_nearbyint_ps
_mm512_mask_pow_pd
_mm512_mask_pow_ps
_mm512_mask_recip_pd
_mm512_mask_recip_ph
_mm512_mask_recip_ps
_mm512_mask_rem_epi32
_mm512_mask_rem_epu32
_mm512_mask_rint_pd
_mm512_mask_rint_ph
_mm512_mask_rint_ps
_mm512_mask_sin_pd
_mm512_mask_sin_ph
_mm512_mask_sin_ps
_mm512_mask_sind_pd
_mm512_mask_sind_ph
_mm512_mask_sind_ps
_mm512_mask_sinh_pd
_mm512_mask_sinh_ph
_mm512_mask_sinh_ps
_mm512_mask_svml_round_pd
_mm512_mask_svml_round_ph
_mm512_mask_tan_pd
_mm512_mask_tan_ph
_mm512_mask_tan_ps
_mm512_mask_tand_pd
_mm512_mask_tand_ph
_mm512_mask_tand_ps
_mm512_mask_tanh_pd
_mm512_mask_tanh_ph
_mm512_mask_tanh_ps
_mm512_mask_trunc_pd
_mm512_mask_trunc_ph
_mm512_mask_trunc_ps
_mm512_nearbyint_pd
_mm512_nearbyint_ph
_mm512_nearbyint_ps
_mm512_pow_pd
_mm512_pow_ph
_mm512_pow_ps
_mm512_recip_pd
_mm512_recip_ph
_mm512_recip_ps
_mm512_rem_epi16
_mm512_rem_epi32
_mm512_rem_epi64
_mm512_rem_epi8
_mm512_rem_epu16
_mm512_rem_epu32
_mm512_rem_epu64
_mm512_rem_epu8
_mm512_rint_pd
_mm512_rint_ph
_mm512_rint_ps
_mm512_sin_pd
_mm512_sin_ph
_mm512_sin_ps
_mm512_sind_pd
_mm512_sind_ph
_mm512_sind_ps
_mm512_sinh_pd
_mm512_sinh_ph
_mm512_sinh_ps
_mm512_svml_round_pd
_mm512_svml_round_ph
_mm512_tan_pd
_mm512_tan_ph
_mm512_tan_ps
_mm512_tand_pd
_mm512_tand_ph
_mm512_tand_ps
_mm512_tanh_pd
_mm512_tanh_ph
_mm512_tanh_ps
_mm512_trunc_pd
_mm512_trunc_ph
_mm512_trunc_ps
_mm_acos_pd
_mm_acos_ph
_mm_acos_ps
_mm_acosh_pd
_mm_acosh_ph
_mm_acosh_ps
_mm_asin_pd
_mm_asin_ph
_mm_asin_ps
_mm_asinh_pd
_mm_asinh_ph
_mm_asinh_ps
_mm_atan2_pd
_mm_atan2_ph
_mm_atan2_ps
_mm_atan_pd
_mm_atan_ph
_mm_atan_ps
_mm_atanh_pd
_mm_atanh_ph
_mm_atanh_ps
_mm_cbrt_pd
_mm_cbrt_ph
_mm_cbrt_ps
_mm_cdfnorm_pd
_mm_cdfnorm_ph
_mm_cdfnorm_ps
_mm_cdfnorminv_pd
_mm_cdfnorminv_ph
_mm_cdfnorminv_ps
_mm_cexp_ps
_mm_clog_ps
_mm_cos_pd
_mm_cos_ph
_mm_cos_ps
_mm_cosd_pd
_mm_cosd_ph
_mm_cosd_ps
_mm_cosh_pd
_mm_cosh_ph
_mm_cosh_ps
_mm_csqrt_ps
_mm_cvtsd_si64x
_mm_cvtsi128_si64x
_mm_cvtsi64x_sd
_mm_cvtsi64x_si128
_mm_cvttsd_si64x
_mm_div_epi16
_mm_div_epi32
_mm_div_epi64
_mm_div_epi8
_mm_div_epu16
_mm_div_epu32
_mm_div_epu64
_mm_div_epu8
_mm_dpbssd_epi32
_mm_dpbssds_epi32
_mm_dpbsud_epi32
_mm_dpbsuds_epi32
_mm_dpbuud_epi32
_mm_dpbuuds_epi32
_mm_dpwsud_epi32
_mm_dpwsuds_epi32
_mm_dpwusd_epi32
_mm_dpwusds_epi32
_mm_dpwuud_epi32
_mm_dpwuuds_epi32
_mm_erf_pd
_mm_erf_ph
_mm_erf_ps
_mm_erfc_pd
_mm_erfc_ph
_mm_erfc_ps
_mm_erfcinv_pd
_mm_erfcinv_ph
_mm_erfcinv_ps
_mm_erfinv_pd
_mm_erfinv_ph
_mm_erfinv_ps
_mm_exp10_pd
_mm_exp10_ph
_mm_exp10_ps
_mm_exp2_pd
_mm_exp2_ph
_mm_exp2_ps
_mm_exp_pd
_mm_exp_ph
_mm_exp_ps
_mm_expm1_pd
_mm_expm1_ph
_mm_expm1_ps
_mm_hypot_pd
_mm_hypot_ph
_mm_hypot_ps
_mm_idiv_epi32
_mm_invcbrt_pd
_mm_invcbrt_ph
_mm_invcbrt_ps
_mm_invsqrt_pd
_mm_invsqrt_ph
_mm_invsqrt_ps
_mm_irem_epi32
_mm_log10_pd
_mm_log10_ph
_mm_log10_ps
_mm_log1p_pd
_mm_log1p_ph
_mm_log1p_ps
_mm_log2_pd
_mm_log2_ph
_mm_log2_ps
_mm_log_pd
_mm_log_ph
_mm_log_ps
_mm_logb_pd
_mm_logb_ph
_mm_logb_ps
_mm_madd52hi_avx_epu64
_mm_madd52lo_avx_epu64
_mm_mask_reduce_add_epi16
_mm_mask_reduce_add_epi8
_mm_mask_reduce_and_epi16
_mm_mask_reduce_and_epi8
_mm_mask_reduce_max_epi16
_mm_mask_reduce_max_epi8
_mm_mask_reduce_max_epu16
_mm_mask_reduce_max_epu8
_mm_mask_reduce_min_epi16
_mm_mask_reduce_min_epi8
_mm_mask_reduce_min_epu16
_mm_mask_reduce_min_epu8
_mm_mask_reduce_mul_epi16
_mm_mask_reduce_mul_epi8
_mm_mask_reduce_or_epi16
_mm_mask_reduce_or_epi8
_mm_pow_pd
_mm_pow_ph
_mm_pow_ps
_mm_reduce_add_epi16
_mm_reduce_add_epi8
_mm_reduce_and_epi16
_mm_reduce_and_epi8
_mm_reduce_max_epi16
_mm_reduce_max_epi8
_mm_reduce_max_epu16
_mm_reduce_max_epu8
_mm_reduce_min_epi16
_mm_reduce_min_epi8
_mm_reduce_min_epu16
_mm_reduce_min_epu8
_mm_reduce_mul_epi16
_mm_reduce_mul_epi8
_mm_reduce_or_epi16
_mm_reduce_or_epi8
_mm_rem_epi16
_mm_rem_epi32
_mm_rem_epi64
_mm_rem_epi8
_mm_rem_epu16
_mm_rem_epu32
_mm_rem_epu64
_mm_rem_epu8
_mm_sin_pd
_mm_sin_ph
_mm_sin_ps
_mm_sind_pd
_mm_sind_ph
_mm_sind_ps
_mm_sinh_pd
_mm_sinh_ph
_mm_sinh_ps
_mm_sm3msg1_epi32
_mm_sm3msg2_epi32
_mm_sm3rnds2_epi32
_mm_sm4key4_epi32
_mm_sm4rnds4_epi32
_mm_svml_ceil_pd
_mm_svml_ceil_ph
_mm_svml_ceil_ps
_mm_svml_floor_pd
_mm_svml_floor_ph
_mm_svml_floor_ps
_mm_svml_round_pd
_mm_svml_round_ph
_mm_svml_round_ps
_mm_svml_sqrt_pd
_mm_svml_sqrt_ph
_mm_svml_sqrt_ps
_mm_tan_pd
_mm_tan_ph
_mm_tan_ps
_mm_tand_pd
_mm_tand_ph
_mm_tand_ps
_mm_tanh_pd
_mm_tanh_ph
_mm_tanh_ps
_mm_trunc_pd
_mm_trunc_ph
_mm_trunc_ps
_mm_udiv_epi32
_mm_urem_epi32
_popcnt32
_popcnt64
_rdpmc
_rotl
_rotl64
_rotr
_rotr64
_rotwl
_rotwr
_urdmsr
# Cannot find value in this scope (in Rust testfiles)
_mm512_set1_pch
_mm_abs_pi16
_mm_abs_pi32
_mm_abs_pi8
_mm_add_pi16
_mm_add_pi32
_mm_add_pi8
_mm_add_si64
_mm_adds_pi16
_mm_adds_pi8
_mm_adds_pu16
_mm_adds_pu8
_mm_alignr_pi8
_mm_and_si64
_mm_andnot_si64
_mm_avg_pu16
_mm_avg_pu8
_mm_cmpeq_pi16
_mm_cmpeq_pi32
_mm_cmpeq_pi8
_mm_cmpgt_pi16
_mm_cmpgt_pi32
_mm_cmpgt_pi8
_mm_cvt_pi2ps
_mm_cvt_ps2pi
_mm_cvtm64_si64
_mm_cvtpd_pi32
_mm_cvtpi16_ps
_mm_cvtpi32_pd
_mm_cvtpi32_ps
_mm_cvtpi32x2_ps
_mm_cvtpi8_ps
_mm_cvtps_pi16
_mm_cvtps_pi32
_mm_cvtps_pi8
_mm_cvtpu16_ps
_mm_cvtpu8_ps
_mm_cvtsi32_si64
_mm_cvtsi64_m64
_mm_cvtsi64_si32
_mm_cvtt_ps2pi
_mm_cvttpd_pi32
_mm512_cbrt_pd
_mm512_cbrt_ph
_mm512_cbrt_ps
_mm512_cdfnorm_pd
_mm512_cdfnorm_ph
_mm512_cdfnorm_ps
_mm512_cdfnorminv_pd
_mm512_cdfnorminv_ph
_mm512_cdfnorminv_ps
_mm512_ceil_pd
_mm512_ceil_ph
_mm512_ceil_ps
_mm512_floor_pd
_mm512_floor_ps
_mm512_mask_ceil_pd
_mm512_mask_ceil_ps
_mm_max_pi16
_mm_max_pu8
_mm_min_pi16
_mm_min_pu8
_mm_movemask_pi8
_mm_movepi64_pi64
_mm_movpi64_epi64
_mm_mul_su32
_mm_mulhi_pi16
_mm_mulhi_pu16
_mm_mulhrs_pi16
_mm_mullo_pi16
_mm_or_si64
_mm_packs_pi16
_mm_packs_pi32
_mm_packs_pu16
_mm_popcnt_u32
_mm_popcnt_u64
_mm_sad_pu8
_mm_set1_epi64
_mm_set1_pch
_mm_set1_pi16
_mm_set1_pi32
_mm_set1_pi8
_mm_set_epi64
_mm_set_pi16
_mm_set_pi32
_mm_set_pi8
_mm_setr_epi64
_mm_setr_pi16
_mm_setr_pi32
_mm_setr_pi8
_mm_shuffle_pi16
_mm_shuffle_pi8
_mm_sign_pi16
_mm_sign_pi32
_mm_sign_pi8
_mm_sll_pi16
_mm_sll_pi32
_mm_sll_si64
_mm_slli_pi16
_mm_slli_pi32
_mm_slli_si64
_mm_sra_pi16
_mm_sra_pi32
_mm_srai_pi16
_mm_srai_pi32
_mm_srl_pi16
_mm_srl_pi32
_mm_srl_si64
_mm_srli_pi16
_mm_srli_pi32
_mm_srli_si64
_mm_sub_pi16
_mm_sub_pi32
_mm_sub_pi8
_mm_sub_si64
_mm_subs_pi16
_mm_subs_pi8
_mm_subs_pu16
_mm_subs_pu8
_mm_unpackhi_pi16
_mm_unpackhi_pi32
_mm_unpackhi_pi8
_mm_unpacklo_pi16
_mm_unpacklo_pi32
_mm_unpacklo_pi8
_mm_xor_si64
_mm256_pow_pd
_mm256_pow_ph
_mm256_pow_ps
_mm256_rem_epi16
_mm256_rem_epi32
_mm256_rem_epi64
_mm256_rem_epi8
_mm256_rem_epu16
_mm256_rem_epu32
_mm256_rem_epu64
_mm256_rem_epu8
_mm256_set1_pch
_mm256_sin_pd
_mm256_sin_ph
_mm256_sin_ps
_mm256_sind_pd
_mm256_sind_ph
_mm256_sind_ps
_mm256_sinh_pd
_mm256_sinh_ph
_mm256_sinh_ps
_mm256_svml_ceil_pd
_mm256_svml_ceil_ph
_mm256_svml_ceil_ps
_mm256_svml_floor_pd
_mm256_svml_floor_ph
_mm256_svml_floor_ps
_mm256_svml_round_pd
_mm256_svml_round_ph
_mm256_svml_round_ps
_mm256_svml_sqrt_pd
_mm256_svml_sqrt_ph
_mm256_svml_sqrt_ps
_mm256_tan_pd
_mm256_tan_ph
_mm256_tan_ps
_mm256_tand_pd
_mm256_tand_ph
_mm256_tand_ps
_mm256_tanh_pd
_mm256_tanh_ph
_mm256_tanh_ps
_mm256_trunc_pd
_mm256_trunc_ph
_mm256_trunc_ps
_mm256_udiv_epi32
_mm256_urem_epi32
_mm512_acos_pd
_mm512_acos_ph
_mm512_acos_ps
_mm512_acosh_pd
_mm512_acosh_ph
_mm512_acosh_ps
_mm_cvttps_pi32
_mm_extract_pi16
_mm_hadd_pi16
_mm_hadd_pi32
_mm_hadds_pi16
_mm_hsub_pi16
_mm_hsub_pi32
_mm_hsubs_pi16
_mm_insert_pi16
_mm_madd_pi16
_mm_maddubs_pi16
_mm512_asin_pd
_mm512_asin_ph
_mm512_asin_ps
_mm512_asinh_pd
_mm512_asinh_ph
_mm512_asinh_ps
_mm512_atan2_pd
_mm512_atan2_ph
_mm512_atan2_ps
_mm512_atan_pd
_mm512_atan_ph
_mm512_atan_ps
_mm512_atanh_pd
_mm512_atanh_ph
_mm512_atanh_ps
_cvtsh_ss
_cvtss_sh
_m_from_int
_m_from_int64
_m_packssdw
_m_packsswb
_m_packuswb
_m_paddb
_m_paddd
_m_paddsb
_m_paddsw
_m_paddusb
_m_paddusw
_m_paddw
_m_pand
_m_pandn
_m_pavgb
_m_pavgw
_m_pcmpeqb
_m_pcmpeqd
_m_pcmpeqw
_m_pcmpgtb
_m_pcmpgtd
_m_pcmpgtw
_m_pextrw
_m_pinsrw
_m_pmaddwd
_m_pmaxsw
_m_pmaxub
_m_pminsw
_m_pminub
_m_pmovmskb
_m_pmulhuw
_m_pmulhw
_m_pmullw
_m_por
_m_psadbw
_m_pshufw
_m_pslld
_m_pslldi
_m_psllq
_m_psllqi
_m_psllw
_m_psllwi
_m_psrad
_m_psradi
_m_psraw
_m_psrawi
_m_psrld
_m_psrldi
_m_psrlq
_m_psrlqi
_m_psrlw
_m_psrlwi
_m_psubb
_m_psubd
_m_psubsb
_m_psubsw
_m_psubusb
_m_psubusw
_m_psubw
_m_punpckhbw
_m_punpckhdq
_m_punpckhwd
_m_punpcklbw
_m_punpckldq
_m_punpcklwd
_m_pxor
_m_to_int
_m_to_int64
_mm512_mask_floor_pd
_mm512_mask_floor_ps
# SDE ERROR: Cannot execute XGETBV with ECX != 0
_xgetbv
# Miscellaneous issues that can be fixed first
_kshiftli_mask16
_kshiftli_mask32
_kshiftli_mask64
_kshiftli_mask8
_kshiftri_mask16
_kshiftri_mask32
_kshiftri_mask64
_kshiftri_mask8
_mm256_castsi128_si256
_mm256_extract_epi16
_mm256_extract_epi8
_mm512_castsi128_si512
_mm512_castsi256_si512
# _mm512_conj_pch
_mm512_mask_reduce_max_pd
_mm512_mask_reduce_max_ps
_mm512_mask_reduce_min_pd
_mm512_mask_reduce_min_ps
_mm_comineq_sh
_mm_extract_epi16
_mm_extract_epi8
_mm_mask_cvtepi16_epi8
_mm_mask_cvtpd_epi32
_mm_mask_cvtpd_ps
_mm_ucomineq_sh

View file

@ -3,13 +3,24 @@ pub const NOTICE: &str = "\
// test are derived from a JSON specification, published under the same license as the
// `intrinsic-test` crate.\n";
pub const POLY128_OSTREAM_DECL: &str = r#"
pub const PLATFORM_C_FORWARD_DECLARATIONS: &str = r#"
#ifdef __aarch64__
std::ostream& operator<<(std::ostream& os, poly128_t value);
#endif
std::ostream& operator<<(std::ostream& os, float16_t value);
std::ostream& operator<<(std::ostream& os, uint8_t value);
// T1 is the `To` type, T2 is the `From` type
template<typename T1, typename T2> T1 cast(T2 x) {
static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same");
T1 ret{};
memcpy(&ret, &x, sizeof(T1));
return ret;
}
"#;
pub const POLY128_OSTREAM_DEF: &str = r#"
pub const PLATFORM_C_DEFINITIONS: &str = r#"
#ifdef __aarch64__
std::ostream& operator<<(std::ostream& os, poly128_t value) {
std::stringstream temp;
@ -23,11 +34,26 @@ std::ostream& operator<<(std::ostream& os, poly128_t value) {
os << res;
return os;
}
#endif
std::ostream& operator<<(std::ostream& os, float16_t value) {
uint16_t temp = 0;
memcpy(&temp, &value, sizeof(float16_t));
std::stringstream ss;
ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << temp;
os << ss.str();
return os;
}
std::ostream& operator<<(std::ostream& os, uint8_t value) {
os << (unsigned int) value;
return os;
}
"#;
// Format f16 values (and vectors containing them) in a way that is consistent with C.
pub const F16_FORMATTING_DEF: &str = r#"
pub const PLATFORM_RUST_DEFINITIONS: &str = r#"
/// Used to continue `Debug`ging SIMD types as `MySimd(1, 2, 3, 4)`, as they
/// were before moving to array-based simd.
#[inline]
@ -113,7 +139,7 @@ impl DebugHexF16 for float16x8x4_t {
}
"#;
pub const AARCH_CONFIGURATIONS: &str = r#"
pub const PLATFORM_RUST_CFGS: &str = r#"
#![cfg_attr(target_arch = "arm", feature(stdarch_arm_neon_intrinsics))]
#![cfg_attr(target_arch = "arm", feature(stdarch_aarch32_crc32))]
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_fcma))]
@ -121,12 +147,13 @@ pub const AARCH_CONFIGURATIONS: &str = r#"
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_i8mm))]
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_sm4))]
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_ftts))]
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_aarch64_jscvt))]
#![feature(fmt_helpers_for_derive)]
#![feature(stdarch_neon_f16)]
#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
use core::arch::aarch64::*;
use core_arch::arch::aarch64::*;
#[cfg(target_arch = "arm")]
use core::arch::arm::*;
use core_arch::arch::arm::*;
"#;

View file

@ -32,11 +32,11 @@ impl SupportedArchitectureTest for ArmArchitectureTest {
const NOTICE: &str = config::NOTICE;
const PLATFORM_C_HEADERS: &[&str] = &["arm_neon.h", "arm_acle.h", "arm_fp16.h"];
const PLATFORM_C_DEFINITIONS: &str = config::POLY128_OSTREAM_DEF;
const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::POLY128_OSTREAM_DECL;
const PLATFORM_C_DEFINITIONS: &str = config::PLATFORM_C_DEFINITIONS;
const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::PLATFORM_C_FORWARD_DECLARATIONS;
const PLATFORM_RUST_DEFINITIONS: &str = config::F16_FORMATTING_DEF;
const PLATFORM_RUST_CFGS: &str = config::AARCH_CONFIGURATIONS;
const PLATFORM_RUST_DEFINITIONS: &str = config::PLATFORM_RUST_DEFINITIONS;
const PLATFORM_RUST_CFGS: &str = config::PLATFORM_RUST_CFGS;
fn cpp_compilation(&self) -> Option<CppCompilation> {
compile::build_cpp_compilation(&self.cli_options)

View file

@ -14,10 +14,10 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType {
(None, None) => format!("{const_prefix}{prefix}{bit_len}_t"),
(Some(simd), None) => format!("{prefix}{bit_len}x{simd}_t"),
(Some(simd), Some(vec)) => format!("{prefix}{bit_len}x{simd}x{vec}_t"),
(None, Some(_)) => todo!("{:#?}", self), // Likely an invalid case
(None, Some(_)) => todo!("{self:#?}"), // Likely an invalid case
}
} else {
todo!("{:#?}", self)
todo!("{self:#?}")
}
}
@ -58,14 +58,14 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType {
// The ACLE doesn't support 64-bit polynomial loads on Armv7
// if armv7 and bl == 64, use "s", else "p"
TypeKind::Poly => if choose_workaround && *bl == 64 {"s"} else {"p"},
x => todo!("get_load_function TypeKind: {:#?}", x),
x => todo!("get_load_function TypeKind: {x:#?}"),
},
size = bl,
quad = quad,
len = vec_len.unwrap_or(1),
)
} else {
todo!("get_load_function IntrinsicType: {:#?}", self)
todo!("get_load_function IntrinsicType: {self:#?}")
}
}
@ -90,13 +90,13 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType {
TypeKind::Int(Sign::Signed) => "s",
TypeKind::Float => "f",
TypeKind::Poly => "p",
x => todo!("get_load_function TypeKind: {:#?}", x),
x => todo!("get_load_function TypeKind: {x:#?}"),
},
size = bl,
quad = quad,
)
} else {
todo!("get_lane_function IntrinsicType: {:#?}", self)
todo!("get_lane_function IntrinsicType: {self:#?}")
}
}
@ -112,12 +112,10 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType {
ty = self.c_single_vector_type(),
lanes = (0..self.num_lanes())
.map(move |idx| -> std::string::String {
let lane_fn = self.get_lane_function();
let final_cast = self.generate_final_type_cast();
format!(
"{cast}{lane_fn}(__return_value.val[{vector}], {lane})",
cast = self.c_promotion(),
lane_fn = self.get_lane_function(),
lane = idx,
vector = vector,
"{final_cast}{lane_fn}(__return_value.val[{vector}], {idx})"
)
})
.collect::<Vec<_>>()
@ -129,12 +127,9 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType {
} else if self.num_lanes() > 1 {
(0..self.num_lanes())
.map(|idx| -> std::string::String {
format!(
"{cast}{lane_fn}(__return_value, {lane})",
cast = self.c_promotion(),
lane_fn = self.get_lane_function(),
lane = idx
)
let lane_fn = self.get_lane_function();
let final_cast = self.generate_final_type_cast();
format!("{final_cast}{lane_fn}(__return_value, {idx})")
})
.collect::<Vec<_>>()
.join(r#" << ", " << "#)
@ -148,9 +143,9 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType {
TypeKind::Int(Sign::Signed) => format!("int{}_t", self.inner_size()),
TypeKind::Int(Sign::Unsigned) => format!("uint{}_t", self.inner_size()),
TypeKind::Poly => format!("poly{}_t", self.inner_size()),
ty => todo!("print_result_c - Unknown type: {:#?}", ty),
ty => todo!("print_result_c - Unknown type: {ty:#?}"),
},
promote = self.c_promotion(),
promote = self.generate_final_type_cast(),
)
};

View file

@ -30,7 +30,12 @@ where
}
pub fn to_c_type(&self) -> String {
self.ty.c_type()
let prefix = if self.ty.constant { "const " } else { "" };
format!("{prefix}{}", self.ty.c_type())
}
pub fn generate_name(&self) -> String {
format!("{}_val", self.name)
}
pub fn is_simd(&self) -> bool {
@ -55,16 +60,22 @@ where
}
/// The name (e.g. "A_VALS" or "a_vals") for the array of possible test inputs.
fn rust_vals_array_name(&self) -> impl std::fmt::Display {
pub(crate) fn rust_vals_array_name(&self) -> impl std::fmt::Display {
if self.ty.is_rust_vals_array_const() {
format!("{}_VALS", self.name.to_uppercase())
let loads = crate::common::gen_rust::PASSES;
format!(
"{}_{ty}_{load_size}",
self.name.to_uppercase(),
ty = self.ty.rust_scalar_type(),
load_size = self.ty.num_lanes() * self.ty.num_vectors() + loads - 1,
)
} else {
format!("{}_vals", self.name.to_lowercase())
}
}
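// As an illustration of the new naming scheme (assumed values: argument `a`,
// Rust scalar type `u8`, 8 lanes, 1 vector, PASSES == 20):
//
//     "A" + "_" + "u8" + "_" + (8 * 1 + 20 - 1)  =>  "A_u8_27"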
fn as_call_param_c(&self) -> String {
self.ty.as_call_param_c(&self.name)
self.ty.as_call_param_c(&self.generate_name())
}
}
@ -91,7 +102,7 @@ where
pub fn as_call_param_rust(&self) -> String {
self.iter()
.filter(|a| !a.has_constraint())
.map(|arg| arg.name.clone())
.map(|arg| arg.generate_name() + " as _")
.collect::<Vec<String>>()
.join(", ")
}
@ -106,11 +117,13 @@ where
loads: u32,
) -> std::io::Result<()> {
for arg in self.iter().filter(|&arg| !arg.has_constraint()) {
// Place the variables on an aligned boundary so that aligned,
// architecture-specific load functions can be used to read the values.
writeln!(
w,
"{indentation}const {ty} {name}_vals[] = {values};",
"{indentation}alignas(64) const {ty} {name}_vals[] = {values};",
ty = arg.ty.c_scalar_type(),
name = arg.name,
name = arg.generate_name(),
values = arg.ty.populate_random(indentation, loads, &Language::C)
)?
}
@ -127,20 +140,34 @@ where
loads: u32,
) -> std::io::Result<()> {
for arg in self.iter().filter(|&arg| !arg.has_constraint()) {
writeln!(
w,
"{indentation}{bind} {name}: [{ty}; {load_size}] = {values};",
bind = arg.rust_vals_array_binding(),
name = arg.rust_vals_array_name(),
ty = arg.ty.rust_scalar_type(),
load_size = arg.ty.num_lanes() * arg.ty.num_vectors() + loads - 1,
values = arg.ty.populate_random(indentation, loads, &Language::Rust)
)?
// Constants are defined globally.
if arg.ty.is_rust_vals_array_const() {
continue;
}
Self::gen_arg_rust(arg, w, indentation, loads)?;
}
Ok(())
}
pub fn gen_arg_rust(
arg: &Argument<T>,
w: &mut impl std::io::Write,
indentation: Indentation,
loads: u32,
) -> std::io::Result<()> {
writeln!(
w,
"{indentation}{bind} {name}: [{ty}; {load_size}] = {values};\n",
bind = arg.rust_vals_array_binding(),
name = arg.rust_vals_array_name(),
ty = arg.ty.rust_scalar_type(),
load_size = arg.ty.num_lanes() * arg.ty.num_vectors() + loads - 1,
values = arg.ty.populate_random(indentation, loads, &Language::Rust)
)
}
/// Creates a line for each argument that initializes the argument from an array `[arg]_vals` at
/// an offset `i` using a load intrinsic, in C.
/// e.g `uint8x8_t a = vld1_u8(&a_vals[i]);`
@ -153,7 +180,7 @@ where
format!(
"{indentation}{ty} {name} = cast<{ty}>({load}(&{name}_vals[i]));\n",
ty = arg.to_c_type(),
name = arg.name,
name = arg.generate_name(),
load = if arg.is_simd() {
arg.ty.get_load_function(Language::C)
} else {
@ -171,15 +198,16 @@ where
self.iter()
.filter(|&arg| !arg.has_constraint())
.map(|arg| {
let load = if arg.is_simd() {
arg.ty.get_load_function(Language::Rust)
} else {
"*".to_string()
};
let typecast = if load.len() > 2 { "as _" } else { "" };
format!(
"{indentation}let {name} = {load}({vals_name}.as_ptr().offset(i));\n",
name = arg.name,
"{indentation}let {name} = {load}({vals_name}.as_ptr().offset(i){typecast});\n",
name = arg.generate_name(),
vals_name = arg.rust_vals_array_name(),
load = if arg.is_simd() {
arg.ty.get_load_function(Language::Rust)
} else {
"*".to_string()
},
)
})
.collect()

View file

@ -54,6 +54,9 @@ pub struct Cli {
/// Set the sysroot for the C++ compiler
#[arg(long)]
pub cxx_toolchain_dir: Option<String>,
#[arg(long, default_value_t = 100u8)]
pub sample_percentage: u8,
}
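// With clap's derive naming, this should surface as `--sample-percentage`;
// a hedged invocation sketch (binary name and spec path are placeholders):
//
//     intrinsic-test intrinsics.json --sample-percentage 25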
pub struct ProcessedCli {
@ -65,6 +68,7 @@ pub struct ProcessedCli {
pub linker: Option<String>,
pub cxx_toolchain_dir: Option<String>,
pub skip: Vec<String>,
pub sample_percentage: u8,
}
impl ProcessedCli {
@ -74,6 +78,7 @@ impl ProcessedCli {
let target = cli_options.target;
let linker = cli_options.linker;
let cxx_toolchain_dir = cli_options.cxx_toolchain_dir;
let sample_percentage = cli_options.sample_percentage;
let skip = if let Some(filename) = cli_options.skip {
let data = std::fs::read_to_string(&filename).expect("Failed to open file");
@ -108,6 +113,7 @@ impl ProcessedCli {
cxx_toolchain_dir,
skip,
filename,
sample_percentage,
}
}
}

View file

@ -14,15 +14,14 @@ pub fn compare_outputs(intrinsic_name_list: &Vec<String>, runner: &str, target:
let intrinsics = intrinsic_name_list
.par_iter()
.filter_map(|intrinsic_name| {
let c = runner_command(runner)
.arg("intrinsic-test-programs")
.arg("./intrinsic-test-programs")
.arg(intrinsic_name)
.current_dir("c_programs")
.output();
let rust = runner_command(runner)
.arg(format!("target/{target}/release/intrinsic-test-programs"))
.arg(format!("./target/{target}/release/intrinsic-test-programs"))
.arg(intrinsic_name)
.current_dir("rust_programs")
.output();

View file

@ -119,7 +119,7 @@ impl CppCompilation {
output: &str,
) -> std::io::Result<std::process::Output> {
let mut cmd = clone_command(&self.0);
cmd.args([input, "-c", "-o", output]);
cmd.args([input, "-v", "-c", "-o", output]);
cmd.output()
}

View file

@ -6,6 +6,15 @@ use super::intrinsic_helpers::IntrinsicTypeDefinition;
// The number of times each intrinsic will be called.
const PASSES: u32 = 20;
const COMMON_HEADERS: [&str; 7] = [
"iostream",
"string",
"cstring",
"iomanip",
"sstream",
"type_traits",
"cassert",
];
pub fn generate_c_test_loop<T: IntrinsicTypeDefinition + Sized>(
w: &mut impl std::io::Write,
@ -47,7 +56,15 @@ pub fn generate_c_constraint_blocks<'a, T: IntrinsicTypeDefinition + 'a>(
let ty = current.ty.c_type();
writeln!(w, "{indentation}{{")?;
writeln!(w, "{body_indentation}{ty} {} = {i};", current.name)?;
// TODO: Move to actually specifying the enum value
// instead of typecasting integers, for better clarity
// of generated code.
writeln!(
w,
"{body_indentation}const {ty} {} = ({ty}){i};",
current.generate_name()
)?;
generate_c_constraint_blocks(
w,
@ -99,32 +116,10 @@ pub fn write_mod_cpp<T: IntrinsicTypeDefinition>(
) -> std::io::Result<()> {
write!(w, "{notice}")?;
for header in platform_headers {
for header in COMMON_HEADERS.iter().chain(platform_headers.iter()) {
writeln!(w, "#include <{header}>")?;
}
writeln!(
w,
r#"
#include <iostream>
#include <cstring>
#include <iomanip>
#include <sstream>
template<typename T1, typename T2> T1 cast(T2 x) {{
static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same");
T1 ret{{}};
memcpy(&ret, &x, sizeof(T1));
return ret;
}}
std::ostream& operator<<(std::ostream& os, float16_t value);
"#
)?;
writeln!(w, "{}", forward_declarations)?;
for intrinsic in intrinsics {
@ -137,33 +132,13 @@ std::ostream& operator<<(std::ostream& os, float16_t value);
pub fn write_main_cpp<'a>(
w: &mut impl std::io::Write,
arch_specific_definitions: &str,
arch_specific_headers: &[&str],
intrinsics: impl Iterator<Item = &'a str> + Clone,
) -> std::io::Result<()> {
writeln!(w, "#include <iostream>")?;
writeln!(w, "#include <string>")?;
for header in ["arm_neon.h", "arm_acle.h", "arm_fp16.h"] {
for header in COMMON_HEADERS.iter().chain(arch_specific_headers.iter()) {
writeln!(w, "#include <{header}>")?;
}
writeln!(
w,
r#"
#include <cstring>
#include <iomanip>
#include <sstream>
std::ostream& operator<<(std::ostream& os, float16_t value) {{
uint16_t temp = 0;
memcpy(&temp, &value, sizeof(float16_t));
std::stringstream ss;
ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << temp;
os << ss.str();
return os;
}}
"#
)?;
// NOTE: It's assumed that this value contains the required `ifdef`s.
writeln!(w, "{arch_specific_definitions }")?;

View file

@ -1,25 +1,31 @@
use itertools::Itertools;
use std::process::Command;
use crate::common::argument::ArgumentList;
use crate::common::intrinsic::Intrinsic;
use super::indentation::Indentation;
use super::intrinsic::format_f16_return_value;
use super::intrinsic_helpers::IntrinsicTypeDefinition;
// The number of times each intrinsic will be called.
const PASSES: u32 = 20;
pub(crate) const PASSES: u32 = 20;
macro_rules! concatln {
($($lines:expr),* $(,)?) => {
concat!($( $lines, "\n" ),*)
};
}
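// The macro just appends a newline to every argument, e.g.:
//
//     concatln!("[package]", "name = \"x\"")
//     // expands to concat!("[package]", "\n", "name = \"x\"", "\n")
//     // == "[package]\nname = \"x\"\n"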
fn write_cargo_toml_header(w: &mut impl std::io::Write, name: &str) -> std::io::Result<()> {
writeln!(
w,
concat!(
"[package]\n",
"name = \"{name}\"\n",
"version = \"{version}\"\n",
"authors = [{authors}]\n",
"license = \"{license}\"\n",
"edition = \"2018\"\n",
concatln!(
"[package]",
"name = \"{name}\"",
"version = \"{version}\"",
"authors = [{authors}]",
"license = \"{license}\"",
"edition = \"2018\"",
),
name = name,
version = env!("CARGO_PKG_VERSION"),
@ -37,6 +43,7 @@ pub fn write_bin_cargo_toml(
write_cargo_toml_header(w, "intrinsic-test-programs")?;
writeln!(w, "[dependencies]")?;
writeln!(w, "core_arch = {{ path = \"../crates/core_arch\" }}")?;
for i in 0..module_count {
writeln!(w, "mod_{i} = {{ path = \"mod_{i}/\" }}")?;
@ -118,6 +125,20 @@ pub fn write_lib_rs<T: IntrinsicTypeDefinition>(
writeln!(w, "{definitions}")?;
let mut seen = std::collections::HashSet::new();
for intrinsic in intrinsics {
for arg in &intrinsic.arguments.args {
if !arg.has_constraint() && arg.ty.is_rust_vals_array_const() {
let name = arg.rust_vals_array_name().to_string();
if seen.insert(name) {
ArgumentList::gen_arg_rust(arg, w, Indentation::default(), PASSES)?;
}
}
}
}
for intrinsic in intrinsics {
crate::common::gen_rust::create_rust_test_module(w, intrinsic)?;
}
@ -190,7 +211,7 @@ pub fn generate_rust_test_loop<T: IntrinsicTypeDefinition>(
w: &mut impl std::io::Write,
intrinsic: &Intrinsic<T>,
indentation: Indentation,
specializations: &[Vec<u8>],
specializations: &[Vec<i32>],
passes: u32,
) -> std::io::Result<()> {
let intrinsic_name = &intrinsic.name;
@ -232,30 +253,30 @@ pub fn generate_rust_test_loop<T: IntrinsicTypeDefinition>(
}
}
let return_value = format_f16_return_value(intrinsic);
let indentation2 = indentation.nested();
let indentation3 = indentation2.nested();
writeln!(
write!(
w,
"\
for (id, f) in specializations {{\n\
for i in 0..{passes} {{\n\
unsafe {{\n\
{loaded_args}\
let __return_value = f({args});\n\
println!(\"Result {{id}}-{{}}: {{:?}}\", i + 1, {return_value});\n\
}}\n\
}}\n\
}}",
loaded_args = intrinsic.arguments.load_values_rust(indentation3),
concatln!(
" for (id, f) in specializations {{",
" for i in 0..{passes} {{",
" unsafe {{",
"{loaded_args}",
" let __return_value = f({args});",
" println!(\"Result {{id}}-{{}}: {{:?}}\", i + 1, {return_value});",
" }}",
" }}",
" }}",
),
loaded_args = intrinsic.arguments.load_values_rust(indentation.nest_by(4)),
args = intrinsic.arguments.as_call_param_rust(),
return_value = intrinsic.results.print_result_rust(),
passes = passes,
)
}
/// Generate the specializations (unique sequences of const-generic arguments) for this intrinsic.
fn generate_rust_specializations(
constraints: &mut impl Iterator<Item = impl Iterator<Item = i64>>,
) -> Vec<Vec<u8>> {
) -> Vec<Vec<i32>> {
let mut specializations = vec![vec![]];
for constraint in constraints {
@ -263,7 +284,7 @@ fn generate_rust_specializations(
.flat_map(|right| {
specializations.iter().map(move |left| {
let mut left = left.clone();
left.push(u8::try_from(right).unwrap());
left.push(i32::try_from(right).unwrap());
left
})
})
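The fold builds the cartesian product of the constraint ranges; a small trace with hypothetical constraints [0, 1] and [2, 3]:

// start:        [[]]
// after [0, 1]: [[0], [1]]
// after [2, 3]: [[0, 2], [1, 2], [0, 3], [1, 3]]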

View file

@ -10,6 +10,10 @@ impl Indentation {
pub fn nested(self) -> Self {
Self(self.0 + 1)
}
pub fn nest_by(&self, additional_levels: u32) -> Self {
Self(self.0 + additional_levels)
}
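// `nest_by` is just repeated `nested`, e.g.:
//
//     Indentation(1).nest_by(3) == Indentation(4) // same as .nested().nested().nested()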
}
impl std::fmt::Display for Indentation {

View file

@ -1,5 +1,5 @@
use super::argument::ArgumentList;
use super::intrinsic_helpers::{IntrinsicTypeDefinition, TypeKind};
use super::intrinsic_helpers::IntrinsicTypeDefinition;
/// An intrinsic
#[derive(Debug, PartialEq, Clone)]
@ -16,17 +16,3 @@ pub struct Intrinsic<T: IntrinsicTypeDefinition> {
/// Any architecture-specific tags.
pub arch_tags: Vec<String>,
}
pub fn format_f16_return_value<T: IntrinsicTypeDefinition>(intrinsic: &Intrinsic<T>) -> String {
// the `intrinsic-test` crate compares the output of C and Rust intrinsics. Currently, It uses
// a string representation of the output value to compare. In C, f16 values are currently printed
// as hexadecimal integers. Since https://github.com/rust-lang/rust/pull/127013, rust does print
// them as decimal floating point values. To keep the intrinsics tests working, for now, format
// vectors containing f16 values like C prints them.
let return_value = match intrinsic.results.kind() {
TypeKind::Float if intrinsic.results.inner_size() == 16 => "debug_f16(__return_value)",
_ => "format_args!(\"{__return_value:.150?}\")",
};
String::from(return_value)
}

View file

@ -1,3 +1,4 @@
use std::cmp;
use std::fmt;
use std::ops::Deref;
use std::str::FromStr;
@ -75,9 +76,11 @@ impl TypeKind {
Self::Float => "float",
Self::Int(Sign::Signed) => "int",
Self::Int(Sign::Unsigned) => "uint",
Self::Mask => "uint",
Self::Poly => "poly",
Self::Char(Sign::Signed) => "char",
_ => unreachable!("Not used: {:#?}", self),
Self::Vector => "int",
_ => unreachable!("Not used: {self:#?}"),
}
}
@ -91,7 +94,7 @@ impl TypeKind {
Self::Poly => "u",
Self::Char(Sign::Unsigned) => "u",
Self::Char(Sign::Signed) => "i",
_ => unreachable!("Unused type kind: {:#?}", self),
_ => unreachable!("Unused type kind: {self:#?}"),
}
}
}
@ -129,9 +132,9 @@ impl IntrinsicType {
pub fn inner_size(&self) -> u32 {
if let Some(bl) = self.bit_len {
bl
cmp::max(bl, 8)
} else {
unreachable!("")
unreachable!("{self:#?}")
}
}
@ -154,6 +157,7 @@ impl IntrinsicType {
pub fn c_scalar_type(&self) -> String {
match self.kind() {
TypeKind::Char(_) => String::from("char"),
TypeKind::Vector => String::from("int32_t"),
_ => format!(
"{prefix}{bits}_t",
prefix = self.kind().c_prefix(),
@ -162,14 +166,6 @@ impl IntrinsicType {
}
}
pub fn rust_scalar_type(&self) -> String {
format!(
"{prefix}{bits}",
prefix = self.kind().rust_prefix(),
bits = self.inner_size()
)
}
pub fn c_promotion(&self) -> &str {
match *self {
IntrinsicType {
@ -177,9 +173,9 @@ impl IntrinsicType {
bit_len: Some(8),
..
} => match kind {
TypeKind::Int(Sign::Signed) => "(int)",
TypeKind::Int(Sign::Unsigned) => "(unsigned int)",
TypeKind::Poly => "(unsigned int)(uint8_t)",
TypeKind::Int(Sign::Signed) => "int",
TypeKind::Int(Sign::Unsigned) => "unsigned int",
TypeKind::Poly => "uint8_t",
_ => "",
},
IntrinsicType {
@ -188,9 +184,9 @@ impl IntrinsicType {
..
} => match bit_len {
8 => unreachable!("handled above"),
16 => "(uint16_t)",
32 => "(uint32_t)",
64 => "(uint64_t)",
16 => "uint16_t",
32 => "uint32_t",
64 => "uint64_t",
128 => "",
_ => panic!("invalid bit_len"),
},
@ -199,16 +195,16 @@ impl IntrinsicType {
bit_len: Some(bit_len),
..
} => match bit_len {
16 => "(float16_t)",
32 => "(float)",
64 => "(double)",
16 => "float16_t",
32 => "float",
64 => "double",
128 => "",
_ => panic!("invalid bit_len"),
},
IntrinsicType {
kind: TypeKind::Char(_),
..
} => "(char)",
} => "char",
_ => "",
}
}
@ -221,15 +217,16 @@ impl IntrinsicType {
) -> String {
match self {
IntrinsicType {
bit_len: Some(bit_len @ (8 | 16 | 32 | 64)),
kind: kind @ (TypeKind::Int(_) | TypeKind::Poly | TypeKind::Char(_)),
bit_len: Some(bit_len @ (1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 16 | 32 | 64)),
kind:
kind @ (TypeKind::Int(_) | TypeKind::Poly | TypeKind::Char(_) | TypeKind::Mask),
simd_len,
vec_len,
..
} => {
let (prefix, suffix) = match language {
Language::Rust => ("[", "]"),
Language::C => ("{", "}"),
Language::Rust => ('[', ']'),
Language::C => ('{', '}'),
};
let body_indentation = indentation.nested();
format!(
@ -265,12 +262,12 @@ impl IntrinsicType {
..
} => {
let (prefix, cast_prefix, cast_suffix, suffix) = match (language, bit_len) {
(&Language::Rust, 16) => ("[", "f16::from_bits(", ")", "]"),
(&Language::Rust, 32) => ("[", "f32::from_bits(", ")", "]"),
(&Language::Rust, 64) => ("[", "f64::from_bits(", ")", "]"),
(&Language::C, 16) => ("{", "cast<float16_t, uint16_t>(", ")", "}"),
(&Language::C, 32) => ("{", "cast<float, uint32_t>(", ")", "}"),
(&Language::C, 64) => ("{", "cast<double, uint64_t>(", ")", "}"),
(&Language::Rust, 16) => ('[', "f16::from_bits(", ")", ']'),
(&Language::Rust, 32) => ('[', "f32::from_bits(", ")", ']'),
(&Language::Rust, 64) => ('[', "f64::from_bits(", ")", ']'),
(&Language::C, 16) => ('{', "cast<float16_t, uint16_t>(", ")", '}'),
(&Language::C, 32) => ('{', "cast<float, uint32_t>(", ")", '}'),
(&Language::C, 64) => ('{', "cast<double, uint64_t>(", ")", '}'),
_ => unreachable!(),
};
format!(
@ -283,7 +280,44 @@ impl IntrinsicType {
)))
)
}
_ => unimplemented!("populate random: {:#?}", self),
IntrinsicType {
kind: TypeKind::Vector,
bit_len: Some(128 | 256 | 512),
simd_len,
vec_len,
..
} => {
let (prefix, suffix) = match language {
Language::Rust => ('[', ']'),
Language::C => ('{', '}'),
};
let body_indentation = indentation.nested();
let effective_bit_len = 32;
format!(
"{prefix}\n{body}\n{indentation}{suffix}",
body = (0..(vec_len.unwrap_or(1) * simd_len.unwrap_or(1) + loads - 1))
.format_with(",\n", |i, fmt| {
let src = value_for_array(effective_bit_len, i);
assert!(src == 0 || src.ilog2() < effective_bit_len);
if (src >> (effective_bit_len - 1)) != 0 {
// `src` is a two's complement representation of a negative value.
let mask = !0u64 >> (64 - effective_bit_len);
let ones_compl = src ^ mask;
let twos_compl = ones_compl + 1;
if (twos_compl == src) && (language == &Language::C) {
// `src` is INT*_MIN. C requires `-0x7fffffff - 1` to avoid
// undefined literal overflow behaviour.
fmt(&format_args!("{body_indentation}-{ones_compl:#x} - 1"))
} else {
fmt(&format_args!("{body_indentation}-{twos_compl:#x}"))
}
} else {
fmt(&format_args!("{body_indentation}{src:#x}"))
}
})
)
}
_ => unimplemented!("populate random: {self:#?}"),
}
}
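// Worked examples for the negative branch above, with effective_bit_len = 32:
//
//     src = 0xffff_fff0 (-16):      ones_compl = 0xf, twos_compl = 0x10 -> emits "-0x10"
//     src = 0x8000_0000 (i32::MIN): twos_compl == src                   -> emits "-0x7fffffff - 1"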
@ -298,7 +332,7 @@ impl IntrinsicType {
kind: TypeKind::Int(_) | TypeKind::Poly,
..
} => true,
_ => unimplemented!(),
_ => true,
}
}
@ -330,4 +364,40 @@ pub trait IntrinsicTypeDefinition: Deref<Target = IntrinsicType> {
/// rust debug output format for the return type. The generated line assumes
/// there is an int i in scope which is the current pass number.
fn print_result_c(&self, indentation: Indentation, additional: &str) -> String;
/// Formats the intrinsic's return value so that the Rust output matches the C
/// output for the return type.
///
/// The `intrinsic-test` crate compares the output of C and Rust intrinsics. Currently, it uses
/// a string representation of the output value for the comparison. In C, f16 values are currently
/// printed as hexadecimal integers. Since https://github.com/rust-lang/rust/pull/127013, Rust
/// prints them as decimal floating-point values. To keep the intrinsics tests working, for now,
/// format vectors containing f16 values the way C prints them.
fn print_result_rust(&self) -> String {
let return_value = match self.kind() {
TypeKind::Float if self.inner_size() == 16 => "debug_f16(__return_value)",
_ => "format_args!(\"{__return_value:.150?}\")",
};
String::from(return_value)
}
/// To enable architecture-specific logic
fn rust_scalar_type(&self) -> String {
format!(
"{prefix}{bits}",
prefix = self.kind().rust_prefix(),
bits = self.inner_size()
)
}
fn generate_final_type_cast(&self) -> String {
let type_data = self.c_promotion();
if type_data.len() > 2 {
format!("({type_data})")
} else {
String::new()
}
}
}
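Tying the last two helpers together, the behaviour implied by `c_promotion` above is:

// i8  => c_promotion() == "int"   => generate_final_type_cast() == "(int)"
// f32 => c_promotion() == "float" => "(float)"
// 128-bit types => c_promotion() == "" => String::new(), so no cast is emitted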

View file

@ -49,7 +49,7 @@ pub trait SupportedArchitectureTest {
fn cpp_compilation(&self) -> Option<CppCompilation>;
fn build_c_file(&self) -> bool {
let (chunk_size, chunk_count) = chunk_info(self.intrinsics().len());
let (chunk_size, chunk_count) = manual_chunk(self.intrinsics().len(), 400);
let cpp_compiler_wrapped = self.cpp_compilation();
@ -60,34 +60,42 @@ pub trait SupportedArchitectureTest {
.map(|(i, chunk)| {
let c_filename = format!("c_programs/mod_{i}.cpp");
let mut file = File::create(&c_filename).unwrap();
write_mod_cpp(
let mod_file_write_result = write_mod_cpp(
&mut file,
Self::NOTICE,
Self::PLATFORM_C_HEADERS,
Self::PLATFORM_C_FORWARD_DECLARATIONS,
chunk,
)
.unwrap();
);
if let Err(error) = mod_file_write_result {
return Err(format!("Error writing to mod_{i}.cpp: {error:?}"));
}
// compile this cpp file into a .o file.
//
// This is done because `cpp_compiler_wrapped` is None when
// the --generate-only flag is passed
trace!("compiling mod_{i}.cpp");
if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() {
let output = cpp_compiler
.compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o"))?;
assert!(output.status.success(), "{output:?}");
}
let compile_output = cpp_compiler
.compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o"));
trace!("finished compiling mod_{i}.cpp");
if let Err(compile_error) = compile_output {
return Err(format!("Error compiling mod_{i}.cpp: {compile_error:?}"));
}
}
Ok(())
})
.collect::<Result<(), std::io::Error>>()
.collect::<Result<(), String>>()
.unwrap();
let mut file = File::create("c_programs/main.cpp").unwrap();
write_main_cpp(
&mut file,
Self::PLATFORM_C_DEFINITIONS,
Self::PLATFORM_C_HEADERS,
self.intrinsics().iter().map(|i| i.name.as_str()),
)
.unwrap();
@ -96,7 +104,7 @@ pub trait SupportedArchitectureTest {
// the --generate-only flag is passed
if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() {
// compile this cpp file into a .o file
info!("compiling main.cpp");
trace!("compiling main.cpp");
let output = cpp_compiler
.compile_object_file("main.cpp", "intrinsic-test-programs.o")
.unwrap();
@ -118,7 +126,7 @@ pub trait SupportedArchitectureTest {
fn build_rust_file(&self) -> bool {
std::fs::create_dir_all("rust_programs/src").unwrap();
let (chunk_size, chunk_count) = chunk_info(self.intrinsics().len());
let (chunk_size, chunk_count) = manual_chunk(self.intrinsics().len(), 400);
let mut cargo = File::create("rust_programs/Cargo.toml").unwrap();
write_bin_cargo_toml(&mut cargo, chunk_count).unwrap();
@ -188,9 +196,13 @@ pub trait SupportedArchitectureTest {
}
}
pub fn chunk_info(intrinsic_count: usize) -> (usize, usize) {
let available_parallelism = std::thread::available_parallelism().unwrap().get();
let chunk_size = intrinsic_count.div_ceil(Ord::min(available_parallelism, intrinsic_count));
// pub fn chunk_info(intrinsic_count: usize) -> (usize, usize) {
// let available_parallelism = std::thread::available_parallelism().unwrap().get();
// let chunk_size = intrinsic_count.div_ceil(Ord::min(available_parallelism, intrinsic_count));
// (chunk_size, intrinsic_count.div_ceil(chunk_size))
// }
pub fn manual_chunk(intrinsic_count: usize, chunk_size: usize) -> (usize, usize) {
(chunk_size, intrinsic_count.div_ceil(chunk_size))
}
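For example, with the hard-coded chunk size of 400 used above:

// manual_chunk(1000, 400) == (400, 3): three modules of at most 400 intrinsics each
// manual_chunk(400, 400)  == (400, 1)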

View file

@ -4,6 +4,13 @@
pub fn value_for_array(bits: u32, index: u32) -> u64 {
let index = index as usize;
match bits {
1 => VALUES_8[index % 2].into(),
2 => VALUES_8[index % 4].into(),
3 => VALUES_8[index % 8].into(),
4 => VALUES_8[index % 16].into(),
5 => VALUES_5[index % VALUES_5.len()].into(),
6 => VALUES_6[index % VALUES_6.len()].into(),
7 => VALUES_7[index % VALUES_7.len()].into(),
8 => VALUES_8[index % VALUES_8.len()].into(),
16 => VALUES_16[index % VALUES_16.len()].into(),
32 => VALUES_32[index % VALUES_32.len()].into(),
@ -12,6 +19,24 @@ pub fn value_for_array(bits: u32, index: u32) -> u64 {
}
}
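A couple of concrete lookups under the new sub-byte arms (table values below):

// value_for_array(1, 3)  == VALUES_8[3 % 2]   == VALUES_8[1] == 0x01
// value_for_array(2, 6)  == VALUES_8[6 % 4]   == VALUES_8[2] == 0x02
// value_for_array(5, 33) == VALUES_5[33 % 32] == VALUES_5[1] == 0x01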
pub const VALUES_5: &[u8] = &[
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
];
pub const VALUES_6: &[u8] = &[
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
];
pub const VALUES_7: &[u8] = &[
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
];
pub const VALUES_8: &[u8] = &[
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0xf0, 0x80, 0x3b, 0xff,

View file

@ -3,10 +3,12 @@ extern crate log;
mod arm;
mod common;
mod x86;
use arm::ArmArchitectureTest;
use common::SupportedArchitectureTest;
use common::cli::{Cli, ProcessedCli};
use x86::X86ArchitectureTest;
fn main() {
pretty_env_logger::init();
@ -18,6 +20,7 @@ fn main() {
| "armv7-unknown-linux-gnueabihf"
| "aarch64_be-unknown-linux-gnu" => run(ArmArchitectureTest::create(processed_cli_options)),
"x86_64-unknown-linux-gnu" => run(X86ArchitectureTest::create(processed_cli_options)),
_ => std::process::exit(0),
}
}

View file

@ -0,0 +1,47 @@
use crate::common::cli::ProcessedCli;
use crate::common::compile_c::{CompilationCommandBuilder, CppCompilation};
pub fn build_cpp_compilation(config: &ProcessedCli) -> Option<CppCompilation> {
let cpp_compiler = config.cpp_compiler.as_ref()?;
// -ffp-contract=off emulates Rust's approach of not fusing separate mul-add operations
let mut command = CompilationCommandBuilder::new()
.add_arch_flags(["icelake-client"])
.set_compiler(cpp_compiler)
.set_target(&config.target)
.set_opt_level("2")
.set_cxx_toolchain_dir(config.cxx_toolchain_dir.as_deref())
.set_project_root("c_programs")
.add_extra_flags(vec![
"-ffp-contract=off",
"-Wno-narrowing",
"-mavx",
"-mavx2",
"-mavx512f",
"-msse2",
"-mavx512vl",
"-mavx512bw",
"-mavx512dq",
"-mavx512cd",
"-mavx512fp16",
"-msha512",
"-msm4",
"-mavxvnni",
"-mavx512bitalg",
"-mavx512ifma",
"-mavx512vbmi",
"-mavx512vbmi2",
"-mavx512vnni",
"-mavx512vpopcntdq",
"-ferror-limit=1000",
"-std=c++23",
]);
if !cpp_compiler.contains("clang") {
command = command.add_extra_flag("-flax-vector-conversions");
}
let cpp_compiler = command.into_cpp_compilation();
Some(cpp_compiler)
}

View file

@ -0,0 +1,409 @@
pub const NOTICE: &str = "\
// This is a transient test file, not intended for distribution. Some aspects of the
// test are derived from an XML specification, published under the same license as the
// `intrinsic-test` crate.\n";
// Format f16 values (and vectors containing them) in a way that is consistent with C.
pub const PLATFORM_RUST_DEFINITIONS: &str = r#"
use std::arch::x86_64::*;
#[inline]
unsafe fn _mm_loadu_ph_to___m128i(mem_addr: *const f16) -> __m128i {
_mm_castph_si128(_mm_loadu_ph(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_ph_to___m256i(mem_addr: *const f16) -> __m256i {
_mm256_castph_si256(_mm256_loadu_ph(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_ph_to___m512i(mem_addr: *const f16) -> __m512i {
_mm512_castph_si512(_mm512_loadu_ph(mem_addr))
}
#[inline]
unsafe fn _mm_loadu_ps_to___m128h(mem_addr: *const f32) -> __m128h {
_mm_castps_ph(_mm_loadu_ps(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_ps_to___m256h(mem_addr: *const f32) -> __m256h {
_mm256_castps_ph(_mm256_loadu_ps(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_ps_to___m512h(mem_addr: *const f32) -> __m512h {
_mm512_castps_ph(_mm512_loadu_ps(mem_addr))
}
#[inline]
unsafe fn _mm_loadu_epi16_to___m128d(mem_addr: *const i16) -> __m128d {
_mm_castsi128_pd(_mm_loadu_epi16(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_epi16_to___m256d(mem_addr: *const i16) -> __m256d {
_mm256_castsi256_pd(_mm256_loadu_epi16(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_epi16_to___m512d(mem_addr: *const i16) -> __m512d {
_mm512_castsi512_pd(_mm512_loadu_epi16(mem_addr))
}
#[inline]
unsafe fn _mm_loadu_epi32_to___m128d(mem_addr: *const i32) -> __m128d {
_mm_castsi128_pd(_mm_loadu_epi32(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_epi32_to___m256d(mem_addr: *const i32) -> __m256d {
_mm256_castsi256_pd(_mm256_loadu_epi32(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_epi32_to___m512d(mem_addr: *const i32) -> __m512d {
_mm512_castsi512_pd(_mm512_loadu_epi32(mem_addr))
}
#[inline]
unsafe fn _mm_loadu_epi64_to___m128d(mem_addr: *const i64) -> __m128d {
_mm_castsi128_pd(_mm_loadu_epi64(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_epi64_to___m256d(mem_addr: *const i64) -> __m256d {
_mm256_castsi256_pd(_mm256_loadu_epi64(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_epi64_to___m512d(mem_addr: *const i64) -> __m512d {
_mm512_castsi512_pd(_mm512_loadu_epi64(mem_addr))
}
// ===
#[inline]
unsafe fn _mm_loadu_epi16_to___m128(mem_addr: *const i16) -> __m128 {
_mm_castsi128_ps(_mm_loadu_epi16(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_epi16_to___m256(mem_addr: *const i16) -> __m256 {
_mm256_castsi256_ps(_mm256_loadu_epi16(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_epi16_to___m512(mem_addr: *const i16) -> __m512 {
_mm512_castsi512_ps(_mm512_loadu_epi16(mem_addr))
}
#[inline]
unsafe fn _mm_loadu_epi32_to___m128(mem_addr: *const i32) -> __m128 {
_mm_castsi128_ps(_mm_loadu_epi32(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_epi32_to___m256(mem_addr: *const i32) -> __m256 {
_mm256_castsi256_ps(_mm256_loadu_epi32(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_epi32_to___m512(mem_addr: *const i32) -> __m512 {
_mm512_castsi512_ps(_mm512_loadu_epi32(mem_addr))
}
#[inline]
unsafe fn _mm_loadu_epi64_to___m128(mem_addr: *const i64) -> __m128 {
_mm_castsi128_ps(_mm_loadu_epi64(mem_addr))
}
#[inline]
unsafe fn _mm256_loadu_epi64_to___m256(mem_addr: *const i64) -> __m256 {
_mm256_castsi256_ps(_mm256_loadu_epi64(mem_addr))
}
#[inline]
unsafe fn _mm512_loadu_epi64_to___m512(mem_addr: *const i64) -> __m512 {
_mm512_castsi512_ps(_mm512_loadu_epi64(mem_addr))
}
#[inline]
fn debug_simd_finish<T: core::fmt::Debug, const N: usize>(
formatter: &mut core::fmt::Formatter<'_>,
type_name: &str,
array: &[T; N],
) -> core::fmt::Result {
core::fmt::Formatter::debug_tuple_fields_finish(
formatter,
type_name,
&core::array::from_fn::<&dyn core::fmt::Debug, N, _>(|i| &array[i]),
)
}
#[repr(transparent)]
struct Hex<T>(T);
impl<T: DebugHexF16> core::fmt::Debug for Hex<T> {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
<T as DebugHexF16>::fmt(&self.0, f)
}
}
fn debug_f16<T: DebugHexF16>(x: T) -> impl core::fmt::Debug {
Hex(x)
}
trait DebugHexF16 {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result;
}
impl DebugHexF16 for f16 {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
write!(f, "{:#06x?}", self.to_bits())
}
}
impl DebugHexF16 for __m128h {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let array = unsafe { core::mem::transmute::<_, [Hex<f16>; 8]>(*self) };
debug_simd_finish(f, "__m128h", &array)
}
}
impl DebugHexF16 for __m128i {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let array = unsafe { core::mem::transmute::<_, [Hex<f16>; 8]>(*self) };
debug_simd_finish(f, "__m128i", &array)
}
}
impl DebugHexF16 for __m256h {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let array = unsafe { core::mem::transmute::<_, [Hex<f16>; 16]>(*self) };
debug_simd_finish(f, "__m256h", &array)
}
}
impl DebugHexF16 for __m256i {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let array = unsafe { core::mem::transmute::<_, [Hex<f16>; 16]>(*self) };
debug_simd_finish(f, "__m256i", &array)
}
}
impl DebugHexF16 for __m512h {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let array = unsafe { core::mem::transmute::<_, [Hex<f16>; 32]>(*self) };
debug_simd_finish(f, "__m512h", &array)
}
}
impl DebugHexF16 for __m512i {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let array = unsafe { core::mem::transmute::<_, [Hex<f16>; 32]>(*self) };
debug_simd_finish(f, "__m512i", &array)
}
}
trait DebugAs<T> {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result;
}
impl<T: core::fmt::Display> DebugAs<T> for T {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
write!(f, "{self}")
}
}
macro_rules! impl_debug_as {
($simd:ty, $name:expr, $bits:expr, [$($type:ty),+]) => {
$(
impl DebugAs<$type> for $simd {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
const ELEMENT_BITS: usize = core::mem::size_of::<$type>() * 8;
const NUM_ELEMENTS: usize = $bits / ELEMENT_BITS;
let array = unsafe { core::mem::transmute::<_, [$type; NUM_ELEMENTS]>(*self) };
debug_simd_finish(f, $name, &array)
}
}
)+
};
}
impl_debug_as!(__m128i, "__m128i", 128, [u8, i8, u16, i16, u32, i32, u64, i64]);
impl_debug_as!(__m256i, "__m256i", 256, [u8, i8, u16, i16, u32, i32, u64, i64]);
impl_debug_as!(__m512i, "__m512i", 512, [u8, i8, u16, i16, u32, i32, u64, i64]);
impl_debug_as!(__m128h, "__m128h", 128, [f32]);
impl_debug_as!(__m256h, "__m256h", 256, [f32]);
impl_debug_as!(__m512h, "__m512h", 512, [f32]);
fn debug_as<V, T>(x: V) -> impl core::fmt::Debug
where V: DebugAs<T>
{
struct DebugWrapper<V, T>(V, core::marker::PhantomData<T>);
impl<V: DebugAs<T>, T> core::fmt::Debug for DebugWrapper<V, T> {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
self.0.fmt(f)
}
}
DebugWrapper(x, core::marker::PhantomData)
}
"#;
pub const PLATFORM_C_FORWARD_DECLARATIONS: &str = r#"
#ifndef X86_DECLARATIONS
#define X86_DECLARATIONS
typedef _Float16 float16_t;
typedef float float32_t;
typedef double float64_t;
#define __int64 long long
#define __int32 int
std::ostream& operator<<(std::ostream& os, _Float16 value);
std::ostream& operator<<(std::ostream& os, __m128i value);
std::ostream& operator<<(std::ostream& os, __m256i value);
std::ostream& operator<<(std::ostream& os, __m512i value);
std::ostream& operator<<(std::ostream& os, __mmask8 value);
#define _mm512_extract_intrinsic_test_epi8(m, lane) \
_mm_extract_epi8(_mm512_extracti64x2_epi64((m), (lane) / 16), (lane) % 16)
#define _mm512_extract_intrinsic_test_epi16(m, lane) \
_mm_extract_epi16(_mm512_extracti64x2_epi64((m), (lane) / 8), (lane) % 8)
#define _mm512_extract_intrinsic_test_epi32(m, lane) \
_mm_extract_epi32(_mm512_extracti64x2_epi64((m), (lane) / 4), (lane) % 4)
#define _mm512_extract_intrinsic_test_epi64(m, lane) \
_mm_extract_epi64(_mm512_extracti64x2_epi64((m), (lane) / 2), (lane) % 2)
#define _mm64_extract_intrinsic_test_epi8(m, lane) \
((_mm_extract_pi16((m), (lane) / 2) >> (((lane) % 2) * 8)) & 0xFF)
#define _mm64_extract_intrinsic_test_epi32(m, lane) \
_mm_cvtsi64_si32(_mm_srli_si64(m, (lane) * 32))
// Load f16 (__m128h) and cast to integer (__m128i)
#define _mm_loadu_ph_to___m128i(mem_addr) _mm_castph_si128(_mm_loadu_ph(mem_addr))
#define _mm256_loadu_ph_to___m256i(mem_addr) _mm256_castph_si256(_mm256_loadu_ph(mem_addr))
#define _mm512_loadu_ph_to___m512i(mem_addr) _mm512_castph_si512(_mm512_loadu_ph(mem_addr))
// Load f32 (__m128) and cast to f16 (__m128h)
#define _mm_loadu_ps_to___m128h(mem_addr) _mm_castps_ph(_mm_loadu_ps(mem_addr))
#define _mm256_loadu_ps_to___m256h(mem_addr) _mm256_castps_ph(_mm256_loadu_ps(mem_addr))
#define _mm512_loadu_ps_to___m512h(mem_addr) _mm512_castps_ph(_mm512_loadu_ps(mem_addr))
// Load integer types and cast to double (__m128d, __m256d, __m512d)
#define _mm_loadu_epi16_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr)))
#define _mm256_loadu_epi16_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr)))
#define _mm512_loadu_epi16_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr)))
#define _mm_loadu_epi32_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr)))
#define _mm256_loadu_epi32_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr)))
#define _mm512_loadu_epi32_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr)))
#define _mm_loadu_epi64_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr)))
#define _mm256_loadu_epi64_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr)))
#define _mm512_loadu_epi64_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr)))
// Load integer types and cast to float (__m128, __m256, __m512)
#define _mm_loadu_epi16_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr)))
#define _mm256_loadu_epi16_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr)))
#define _mm512_loadu_epi16_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr)))
#define _mm_loadu_epi32_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr)))
#define _mm256_loadu_epi32_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr)))
#define _mm512_loadu_epi32_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr)))
#define _mm_loadu_epi64_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr)))
#define _mm256_loadu_epi64_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr)))
#define _mm512_loadu_epi64_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr)))
// T1 is the `To` type, T2 is the `From` type
template<typename T1, typename T2> T1 cast(T2 x) {
if constexpr ((std::is_integral_v<T1> && std::is_integral_v<T2>) || (std::is_floating_point_v<T1> && std::is_floating_point_v<T2>)) {
return x;
} else if constexpr (sizeof(T1) <= sizeof(T2)) {
T1 ret{};
std::memcpy(&ret, &x, sizeof(T1));
return ret;
} else {
static_assert(std::is_convertible_v<T2, T1>,
"T2 must either be convertible to T1, or have the same size as T1!");
// sizeof(T1) > sizeof(T2) here, so a size-equal bitcast is impossible;
// preserve the value via the asserted conversion instead of discarding it.
return static_cast<T1>(x);
}
}
#endif
"#;
pub const PLATFORM_C_DEFINITIONS: &str = r#"
std::ostream& operator<<(std::ostream& os, _Float16 value) {
uint16_t temp = 0;
memcpy(&temp, &value, sizeof(_Float16));
std::stringstream ss;
ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << temp;
os << ss.str();
return os;
}
std::ostream& operator<<(std::ostream& os, __m128i value) {
// Stack buffer instead of malloc: the original allocation was never freed.
unsigned char temp[sizeof(__m128i)];
_mm_storeu_si128((__m128i*)temp, value);
std::stringstream ss;
ss << "0x";
for(int i = 0; i < 16; i++) {
// Cast to int: streaming a raw char prints the character, not its hex value.
ss << std::setfill('0') << std::setw(2) << std::hex << static_cast<int>(temp[i]);
}
os << ss.str();
return os;
}
std::ostream& operator<<(std::ostream& os, __m256i value) {
unsigned char temp[sizeof(__m256i)];
_mm256_storeu_si256((__m256i*)temp, value);
std::stringstream ss;
ss << "0x";
for(int i = 0; i < 32; i++) {
ss << std::setfill('0') << std::setw(2) << std::hex << static_cast<int>(temp[i]);
}
os << ss.str();
return os;
}
std::ostream& operator<<(std::ostream& os, __m512i value) {
unsigned char temp[sizeof(__m512i)];
_mm512_storeu_si512((__m512i*)temp, value);
std::stringstream ss;
ss << "0x";
for(int i = 0; i < 64; i++) {
ss << std::setfill('0') << std::setw(2) << std::hex << static_cast<int>(temp[i]);
}
os << ss.str();
return os;
}
std::ostream& operator<<(std::ostream& os, __mmask8 value) {
os << static_cast<int>(value);
return os;
}
"#;
pub const PLATFORM_RUST_CFGS: &str = r#"
#![cfg_attr(target_arch = "x86", feature(avx))]
#![cfg_attr(target_arch = "x86", feature(sse))]
#![cfg_attr(target_arch = "x86", feature(sse2))]
#![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512_bf16))]
#![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512_f16))]
#![cfg_attr(target_arch = "x86", feature(stdarch_x86_rtm))]
#![cfg_attr(target_arch = "x86", feature(stdarch_x86_rtm))]
#![cfg_attr(target_arch = "x86_64", feature(x86_amx_intrinsics))]
#![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512_f16))]
#![feature(fmt_helpers_for_derive)]
"#;

View file

@ -0,0 +1,30 @@
use crate::common::constraint::Constraint;
pub fn map_constraints(imm_type: &str, imm_width: u32) -> Option<Constraint> {
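// An n-bit immediate admits the half-open range 0..2^n.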
if imm_width > 0 {
let max: i64 = 2i64.pow(imm_width);
return Some(Constraint::Range(0..max));
}
match imm_type {
// Legal values for variables of `_MM_FROUND` type are:
// 8 => (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
// 9 => (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
// 10 => (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
// 11 => (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
// 4 => _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
"_MM_FROUND" => Some(Constraint::Set(vec![4, 8, 9, 10, 11])),
"_MM_INDEX_SCALE" => Some(Constraint::Set(vec![1, 2, 4, 8])),
"_MM_CMPINT" => Some(Constraint::Range(0..8)),
"_MM_REDUCE" => Some(Constraint::Range(0..8)),
"_MM_FROUND_SAE" => Some(Constraint::Equal(8)),
"_MM_MANTISSA_NORM" => Some(Constraint::Range(0..4)),
"_MM_MANTISSA_NORM_ENUM" => Some(Constraint::Range(0..4)),
"_MM_MANTISSA_SIGN" => Some(Constraint::Range(0..3)),
"_MM_PERM" => Some(Constraint::Range(0..256)),
"_MM_PERM_ENUM" => Some(Constraint::Range(0..256)),
"_MM_CMPINT_ENUM" => Some(Constraint::Range(0..8)),
"_MM_ROUND_MODE" => Some(Constraint::Set(vec![0, 0x2, 0x4, 0x6])),
"_CMP_" => Some(Constraint::Range(0..32)),
_ => None,
}
}

View file

@ -0,0 +1,23 @@
use crate::common::intrinsic_helpers::IntrinsicType;
use crate::x86::xml_parser::Parameter;
use std::ops::{Deref, DerefMut};
#[derive(Debug, Clone, PartialEq)]
pub struct X86IntrinsicType {
pub data: IntrinsicType,
pub param: Parameter,
}
impl Deref for X86IntrinsicType {
type Target = IntrinsicType;
fn deref(&self) -> &Self::Target {
&self.data
}
}
impl DerefMut for X86IntrinsicType {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.data
}
}

View file

@ -0,0 +1,76 @@
mod compile;
mod config;
mod constraint;
mod intrinsic;
mod types;
mod xml_parser;
use crate::common::SupportedArchitectureTest;
use crate::common::cli::ProcessedCli;
use crate::common::compile_c::CppCompilation;
use crate::common::intrinsic::Intrinsic;
use crate::common::intrinsic_helpers::TypeKind;
use intrinsic::X86IntrinsicType;
use itertools::Itertools;
use xml_parser::get_xml_intrinsics;
pub struct X86ArchitectureTest {
intrinsics: Vec<Intrinsic<X86IntrinsicType>>,
cli_options: ProcessedCli,
}
impl SupportedArchitectureTest for X86ArchitectureTest {
type IntrinsicImpl = X86IntrinsicType;
fn cli_options(&self) -> &ProcessedCli {
&self.cli_options
}
fn intrinsics(&self) -> &[Intrinsic<X86IntrinsicType>] {
&self.intrinsics
}
fn cpp_compilation(&self) -> Option<CppCompilation> {
compile::build_cpp_compilation(&self.cli_options)
}
const NOTICE: &str = config::NOTICE;
const PLATFORM_C_HEADERS: &[&str] = &["immintrin.h", "cstddef", "cstdint"];
const PLATFORM_C_DEFINITIONS: &str = config::PLATFORM_C_DEFINITIONS;
const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::PLATFORM_C_FORWARD_DECLARATIONS;
const PLATFORM_RUST_DEFINITIONS: &str = config::PLATFORM_RUST_DEFINITIONS;
const PLATFORM_RUST_CFGS: &str = config::PLATFORM_RUST_CFGS;
fn create(cli_options: ProcessedCli) -> Self {
let intrinsics =
get_xml_intrinsics(&cli_options.filename).expect("Error parsing input file");
let sample_percentage: usize = cli_options.sample_percentage as usize;
let mut intrinsics = intrinsics
.into_iter()
// Not sure how we would compare an intrinsic that returns void.
.filter(|i| i.results.kind() != TypeKind::Void)
.filter(|i| i.results.kind() != TypeKind::BFloat)
.filter(|i| !i.arguments.args.is_empty())
.filter(|i| !i.arguments.iter().any(|a| a.ty.kind() == TypeKind::BFloat))
// Skip pointers for now, we would probably need to look at the return
// type to work out how many elements we need to point to.
.filter(|i| !i.arguments.iter().any(|a| a.is_ptr()))
.filter(|i| !i.arguments.iter().any(|a| a.ty.inner_size() == 128))
.filter(|i| !cli_options.skip.contains(&i.name))
.unique_by(|i| i.name.clone())
.collect::<Vec<_>>();
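// Sampling takes a deterministic prefix of the filtered list; sorting
// afterwards keeps the generated test order stable.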
let sample_size = (intrinsics.len() * sample_percentage) / 100;
intrinsics.truncate(sample_size);
intrinsics.sort_by(|a, b| a.name.cmp(&b.name));
Self { intrinsics, cli_options }
}
}

View file

@ -0,0 +1,485 @@
use std::str::FromStr;
use itertools::Itertools;
use regex::Regex;
use super::intrinsic::X86IntrinsicType;
use crate::common::cli::Language;
use crate::common::indentation::Indentation;
use crate::common::intrinsic_helpers::{IntrinsicType, IntrinsicTypeDefinition, Sign, TypeKind};
use crate::x86::xml_parser::Parameter;
impl IntrinsicTypeDefinition for X86IntrinsicType {
/// Gets a string containing the type in C format, mapping MSVC-style
/// fixed-width names to their <cstdint> equivalents.
fn c_type(&self) -> String {
self.param
.type_data
.replace("unsigned __int64", "uint64_t")
.replace("unsigned __int32", "uint32_t")
.replace("unsigned __int16", "uint16_t")
.replace("unsigned __int8", "uint8_t")
.replace("__int64", "int64_t")
.replace("__int32", "int32_t")
.replace("__int16", "int16_t")
.replace("__int8", "int8_t")
.replace("const ", "")
}
fn c_single_vector_type(&self) -> String {
// matches __m128, __m256 and similar types
let re = Regex::new(r"__m\d+").unwrap();
if re.is_match(self.param.type_data.as_str()) {
self.param.type_data.clone()
} else {
unreachable!("Shouldn't be called on this type")
}
}
// fn rust_type(&self) -> String {
// // handling edge cases first
// // the general handling is implemented below
// if let Some(val) = self.metadata.get("type") {
// match val.as_str() {
// "__m128 const *" => {
// return "&__m128".to_string();
// }
// "__m128d const *" => {
// return "&__m128d".to_string();
// }
// "const void*" => {
// return "&__m128d".to_string();
// }
// _ => {}
// }
// }
// if self.kind() == TypeKind::Void && self.ptr {
// // this has been handled by default settings in
// // the from_param function of X86IntrinsicType
// unreachable!()
// }
// // general handling cases
// let core_part = if self.kind() == TypeKind::Mask {
// // all types of __mmask<int> are handled here
// format!("__mask{}", self.bit_len.unwrap())
// } else if self.simd_len.is_some() {
// // all types of __m<int> vector types are handled here
// let re = Regex::new(r"\__m\d+[a-z]*").unwrap();
// let rust_type = self
// .metadata
// .get("type")
// .map(|val| re.find(val).unwrap().as_str());
// rust_type.unwrap().to_string()
// } else {
// format!(
// "{}{}",
// self.kind.rust_prefix().to_string(),
// self.bit_len.unwrap()
// )
// };
// // extracting "memsize" so that even vector types can be involved
// let memwidth = self
// .metadata
// .get("memwidth")
// .map(|n| str::parse::<u32>(n).unwrap());
// let prefix_part = if self.ptr && self.constant && self.bit_len.eq(&memwidth) {
// "&"
// } else if self.ptr && self.bit_len.eq(&memwidth) {
// "&mut "
// } else if self.ptr && self.constant {
// "*const "
// } else if self.ptr {
// "*mut "
// } else {
// ""
// };
// return prefix_part.to_string() + core_part.as_str();
// }
/// Determines the load function for this type.
fn get_load_function(&self, _language: Language) -> String {
let type_value = self.param.type_data.clone();
if type_value.is_empty() {
unimplemented!("the parameter's `type` attribute is empty!");
}
if type_value.starts_with("__mmask") {
// no load needed: mask values are initialized
// directly from hex constants
String::from("*")
} else if type_value.starts_with("__m") {
// The structure is as follows:
// if "type" starts with __m<num>{h/i/<nothing>},
// use the matching _mm_loadu_*, _mm256_loadu_*
// or _mm512_loadu_* variant.
if type_value.contains("__m64") {
return String::from("*(__m64*)");
}
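// Keep only the digits of the register type and drop "128"/"64", so that
// "__m128*" maps to the bare `_mm_` prefix while "__m256*"/"__m512*" keep
// their width (`_mm256_`, `_mm512_`).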
let type_val_filtered = type_value
.chars()
.filter(|c| c.is_numeric())
.join("")
.replace("128", "")
.replace("64", "");
{
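// When the element type does not match the register type (e.g. f16 lanes
// carried in an `__m128i`), route through the `*_loadu_*_to_*` cast macros
// declared in the platform C forward declarations.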
let suffix = match (self.bit_len, self.kind) {
(Some(16), TypeKind::Float)
if ["__m128i", "__m256i", "__m512i"]
.contains(&self.param.type_data.as_str()) =>
{
format!("ph_to_{}", self.param.type_data)
}
(Some(32), TypeKind::Float)
if ["__m128h", "__m256h", "__m512h"]
.contains(&self.param.type_data.as_str()) =>
{
format!("ps_to_{}", self.param.type_data)
}
(Some(bit_len @ (16 | 32 | 64)), TypeKind::Int(_) | TypeKind::Mask)
if ["__m128d", "__m256d", "__m512d"]
.contains(&self.param.type_data.as_str()) =>
{
format!("epi{bit_len}_to_{}", self.param.type_data)
}
(Some(bit_len @ (16 | 32 | 64)), TypeKind::Int(_) | TypeKind::Mask)
if ["__m128", "__m256", "__m512"]
.contains(&self.param.type_data.as_str()) =>
{
format!("epi{bit_len}_to_{}", self.param.type_data)
}
(Some(bit_len @ (8 | 16 | 32 | 64)), TypeKind::Int(_)) => {
format!("epi{bit_len}")
}
(Some(bit_len), TypeKind::Mask) => format!("epi{bit_len}"),
(Some(16), TypeKind::Float) => String::from("ph"),
(Some(32), TypeKind::Float) => String::from("ps"),
(Some(64), TypeKind::Float) => String::from("pd"),
(Some(128 | 256 | 512), TypeKind::Vector) => String::from("epi32"),
_ => unreachable!("Invalid element type for a vector type! {:?}", self.param),
};
format!("_mm{type_val_filtered}_loadu_{suffix}")
}
} else {
// Anything else (__int<num>, __bfloat16, unsigned short, pointers, etc.)
// is handled with a plain C-style cast.
format!("({type_value})")
}
}
/// Generates a std::cout for the intrinsics results that will match the
/// rust debug output format for the return type. The generated line assumes
/// there is an int i in scope which is the current pass number.
fn print_result_c(&self, indentation: Indentation, additional: &str) -> String {
let lanes = if self.num_vectors() > 1 {
(0..self.num_vectors())
.map(|vector| {
format!(
r#""{ty}(" << {lanes} << ")""#,
ty = self.c_single_vector_type(),
lanes = (0..self.num_lanes())
.map(move |idx| -> String {
format!(
"{cast}{lane_fn}(__return_value.val[{vector}], {lane})",
cast = self.generate_final_type_cast(),
lane_fn = self.get_lane_function(),
lane = idx,
vector = vector,
)
})
.collect::<Vec<_>>()
.join(r#" << ", " << "#)
)
})
.collect::<Vec<_>>()
.join(r#" << ", " << "#)
} else if self.num_lanes() > 1 {
(0..self.num_lanes())
.map(|idx| -> String {
let cast_type = self.c_promotion();
let lane_fn = self.get_lane_function();
if cast_type.len() > 2 {
format!("cast<{cast_type}>({lane_fn}(__return_value, {idx}))")
} else {
format!("{lane_fn}(__return_value, {idx})")
}
})
.collect::<Vec<_>>()
.join(r#" << ", " << "#)
} else {
format!(
"{promote}cast<{cast}>(__return_value)",
cast = match self.kind() {
TypeKind::Void => "void".to_string(),
TypeKind::Float if self.inner_size() == 64 => "double".to_string(),
TypeKind::Float if self.inner_size() == 32 => "float".to_string(),
TypeKind::Mask => format!(
"__mmask{}",
self.bit_len.expect(format!("self: {self:#?}").as_str())
),
TypeKind::Vector => format!(
"__m{}i",
self.bit_len.expect(format!("self: {self:#?}").as_str())
),
_ => self.c_scalar_type(),
},
promote = self.generate_final_type_cast(),
)
};
format!(
r#"{indentation}std::cout << "Result {additional}-" << i+1 << ": {ty}" << std::fixed << std::setprecision(150) << {lanes} << "{close}" << std::endl;"#,
ty = if self.is_simd() {
format!("{}(", self.c_type())
} else {
String::from("")
},
close = if self.is_simd() { ")" } else { "" },
)
}
/// Determines the get lane function for this type.
fn get_lane_function(&self) -> String {
let total_vector_bits: Option<u32> = self
.simd_len
.zip(self.bit_len)
.map(|(simd_len, bit_len)| simd_len * bit_len);
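// Select the extract helper by element width and total vector width; the
// unsigned casts normalize the printed value to the lane's unsigned
// representation instead of a sign-extended int.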
match (self.bit_len, total_vector_bits) {
(Some(8), Some(128)) => String::from("(uint8_t)_mm_extract_epi8"),
(Some(16), Some(128)) => String::from("(uint16_t)_mm_extract_epi16"),
(Some(32), Some(128)) => String::from("(uint32_t)_mm_extract_epi32"),
(Some(64), Some(128)) => String::from("(uint64_t)_mm_extract_epi64"),
(Some(8), Some(256)) => String::from("(uint8_t)_mm256_extract_epi8"),
(Some(16), Some(256)) => String::from("(uint16_t)_mm256_extract_epi16"),
(Some(32), Some(256)) => String::from("(uint32_t)_mm256_extract_epi32"),
(Some(64), Some(256)) => String::from("(uint64_t)_mm256_extract_epi64"),
(Some(8), Some(512)) => String::from("(uint8_t)_mm512_extract_intrinsic_test_epi8"),
(Some(16), Some(512)) => String::from("(uint16_t)_mm512_extract_intrinsic_test_epi16"),
(Some(32), Some(512)) => String::from("(uint32_t)_mm512_extract_intrinsic_test_epi32"),
(Some(64), Some(512)) => String::from("(uint64_t)_mm512_extract_intrinsic_test_epi64"),
(Some(8), Some(64)) => String::from("(uint8_t)_mm64_extract_intrinsic_test_epi8"),
(Some(16), Some(64)) => String::from("(uint16_t)_mm_extract_pi16"),
(Some(32), Some(64)) => String::from("(uint32_t)_mm64_extract_intrinsic_test_epi32"),
_ => unreachable!(
"invalid length for vector argument: {:?}, {:?}",
self.bit_len, self.simd_len
),
}
}
fn rust_scalar_type(&self) -> String {
let prefix = match self.data.kind {
TypeKind::Mask => String::from("__mmask"),
TypeKind::Vector => String::from("i"),
_ => self.kind().rust_prefix().to_string(),
};
let bits = if self.inner_size() >= 128 {
32
} else {
self.inner_size()
};
format!("{prefix}{bits}")
}
fn print_result_rust(&self) -> String {
let return_value = match self.kind() {
TypeKind::Float if self.inner_size() == 16 => "debug_f16(__return_value)".to_string(),
TypeKind::Float
if self.inner_size() == 32
&& ["__m512h"].contains(&self.param.type_data.as_str()) =>
{
"debug_as::<_, f32>(__return_value)".to_string()
}
TypeKind::Int(_)
if ["__m128i", "__m256i", "__m512i"].contains(&self.param.type_data.as_str()) =>
{
format!("debug_as::<_, u{}>(__return_value)", self.inner_size())
}
_ => "format_args!(\"{__return_value:.150?}\")".to_string(),
};
return_value
}
}
impl X86IntrinsicType {
fn from_c(s: &str) -> Result<IntrinsicType, String> {
let s_copy = s
.replace("*", "")
.replace("_", "")
.replace("constexpr", "")
.replace("const", "")
.replace("literal", "");
let s_split = s_copy
.split(" ")
.filter(|s| !s.is_empty())
.last();
let s_split = s_split.map(|s| s.chars().filter(|c| !c.is_numeric()).join(""));
// TODO: make the unwrapping safe
let kind = TypeKind::from_str(s_split.unwrap().trim()).unwrap_or(TypeKind::Void);
let kind = if s.find("unsigned").is_some() {
match kind {
TypeKind::Int(_) => TypeKind::Int(Sign::Unsigned),
TypeKind::Char(_) => TypeKind::Char(Sign::Unsigned),
a => a,
}
} else {
kind
};
let ptr_constant = false;
let constant = s.matches("const").next().is_some();
let ptr = s.matches("*").next().is_some();
Ok(IntrinsicType {
ptr,
ptr_constant,
constant,
kind,
bit_len: None,
simd_len: None,
vec_len: None,
})
}
pub fn update_simd_len(&mut self) {
let mut type_processed = self.param.type_data.clone();
type_processed.retain(|c| c.is_numeric());
// check the param.type and extract numeric part if there are double
// underscores. divide this number with bit-len and set this as simd-len.
// Only __m<int> types can have a simd-len.
if self.param.type_data.contains("__m") && !self.param.type_data.contains("__mmask") {
self.data.simd_len = match str::parse::<u32>(type_processed.as_str()) {
// If bit_len is None, simd_len will be None.
// Else simd_len will be (num_bits / bit_len).
Ok(num_bits) => self
.data
.bit_len
.map(|bit_len| num_bits / bit_len),
Err(_) => None,
};
}
}
pub fn from_param(param: &Parameter) -> Result<Self, String> {
match Self::from_c(param.type_data.as_str()) {
Err(message) => Err(message),
Ok(mut data) => {
// First correct the type of the parameter using param.etype.
// The assumption is that the parameter of type void may have param.type
// as "__m128i", "__mmask8" and the like.
if !param.etype.is_empty() {
match TypeKind::from_str(param.etype.as_str()) {
Ok(value) => {
data.kind = value;
}
Err(_) => {}
};
}
// check for param.etype.
// extract the numeric part and set as bit-len
// If param.etype is not present, guess the default bit-len
let mut etype_processed = param.etype.clone();
etype_processed.retain(|c| c.is_numeric());
let mut type_processed = param.type_data.clone();
type_processed.retain(|c| c.is_numeric());
match str::parse::<u32>(etype_processed.as_str()) {
Ok(value) => data.bit_len = Some(value),
Err(_) => {
data.bit_len = match data.kind() {
TypeKind::Char(_) => Some(8),
TypeKind::BFloat => Some(16),
TypeKind::Int(_) => Some(32),
TypeKind::Float => Some(32),
_ => None,
};
}
}
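// `__mmask<N>` encodes its element width in the type name itself,
// overriding whatever the etype suggested.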
if param.type_data.contains("__mmask") {
data.bit_len = str::parse::<u32>(type_processed.as_str()).ok();
}
if vec!["M512", "M256", "M128"].contains(&param.etype.as_str()) {
match param.type_data.chars().last() {
Some('i') => {
data.kind = TypeKind::Int(Sign::Signed);
data.bit_len = Some(32);
}
Some('h') => {
data.kind = TypeKind::Float;
data.bit_len = Some(16);
}
Some('d') => {
data.kind = TypeKind::Float;
data.bit_len = Some(64);
}
_ => (),
}
}
// default settings for "void *" parameters
// often used by intrinsics to denote memory address or so.
if data.kind == TypeKind::Void && data.ptr {
data.kind = TypeKind::Int(Sign::Unsigned);
data.bit_len = Some(8);
}
// default settings for "void *" parameters
// often used by intrinsics to denote memory address or so.
if data.kind == TypeKind::Mask && data.bit_len.is_none() {
data.bit_len = Some(32);
}
if param.etype == "IMM" || param.imm_width > 0 || param.imm_type.len() > 0 {
data.kind = TypeKind::Int(Sign::Unsigned);
data.constant = true;
}
// Rust defaults to the signed variant unless "unsigned" is explicitly
// mentioned; the `type` field holds C++ type names.
if data.kind == TypeKind::Int(Sign::Unsigned)
&& !(param.type_data.contains("unsigned") || param.type_data.contains("uint"))
{
data.kind = TypeKind::Int(Sign::Signed)
}
// default settings for IMM parameters
if param.etype == "IMM" {
data.bit_len = if param.imm_width > 0 {
Some(param.imm_width)
} else {
Some(8)
}
}
let mut result = X86IntrinsicType {
data,
param: param.clone(),
};
result.update_simd_len();
Ok(result)
}
}
// Tile types won't currently reach here, since the intrinsics that involve
// them often return a "null" type. Such intrinsics are not tested in
// `intrinsic-test` currently and are filtered out at `mod.rs`.
}
}

View file

@ -0,0 +1,139 @@
use crate::common::argument::{Argument, ArgumentList};
use crate::common::intrinsic::Intrinsic;
use crate::common::intrinsic_helpers::TypeKind;
use crate::x86::constraint::map_constraints;
use regex::Regex;
use serde::{Deserialize, Deserializer};
use std::path::Path;
use super::intrinsic::X86IntrinsicType;
// Custom deserializer that parses a string attribute as a u32, defaulting to 0 on failure.
fn string_to_u32<'de, D>(deserializer: D) -> Result<u32, D::Error>
where
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
Ok(s.parse::<u32>().unwrap_or(0))
}
#[derive(Deserialize)]
struct Data {
#[serde(rename = "intrinsic", default)]
intrinsics: Vec<XMLIntrinsic>,
}
#[derive(Deserialize)]
struct XMLIntrinsic {
#[serde(rename = "return")]
pub return_data: Parameter,
#[serde(rename = "@name")]
pub name: String,
// #[serde(rename = "@tech")]
// tech: String,
#[serde(rename = "CPUID", default)]
cpuid: Vec<String>,
#[serde(rename = "parameter", default)]
parameters: Vec<Parameter>,
}
#[derive(Debug, PartialEq, Clone, Deserialize)]
pub struct Parameter {
#[serde(rename = "@varname", default)]
pub var_name: String,
#[serde(rename = "@type", default)]
pub type_data: String,
#[serde(rename = "@etype", default)]
pub etype: String,
#[serde(rename = "@memwidth", default, deserialize_with = "string_to_u32")]
pub memwidth: u32,
#[serde(rename = "@immwidth", default, deserialize_with = "string_to_u32")]
pub imm_width: u32,
#[serde(rename = "@immtype", default)]
pub imm_type: String,
}
pub fn get_xml_intrinsics(
filename: &Path,
) -> Result<Vec<Intrinsic<X86IntrinsicType>>, Box<dyn std::error::Error>> {
let file = std::fs::File::open(filename)?;
let reader = std::io::BufReader::new(file);
let data: Data =
quick_xml::de::from_reader(reader).expect("failed to deserialize the source XML file");
let parsed_intrinsics: Vec<Intrinsic<X86IntrinsicType>> = data
.intrinsics
.into_iter()
.filter_map(|intr| {
// Some(xml_to_intrinsic(intr, target).expect("Couldn't parse XML properly!"))
xml_to_intrinsic(intr).ok()
})
.collect();
Ok(parsed_intrinsics)
}
fn xml_to_intrinsic(
intr: XMLIntrinsic,
) -> Result<Intrinsic<X86IntrinsicType>, Box<dyn std::error::Error>> {
let name = intr.name;
let result = X86IntrinsicType::from_param(&intr.return_data);
let args_check = intr.parameters.into_iter().enumerate().map(|(i, param)| {
let ty = X86IntrinsicType::from_param(&param).ok()?;
let effective_imm_width = if name == "_mm_mpsadbw_epu8" && param.var_name == "imm8" {
3
} else {
param.imm_width
};
let constraint = map_constraints(&param.imm_type, effective_imm_width);
Some(Argument::<X86IntrinsicType>::new(
i,
param.var_name.clone(),
ty,
constraint,
))
});
let mut args = args_check
.collect::<Option<Vec<_>>>()
.ok_or("intrinsic isn't fully supported in this test!")?
.into_iter()
.filter(|arg| arg.ty.ptr || arg.ty.kind != TypeKind::Void)
.collect::<Vec<_>>();
// if one of the args has etype="MASK" and type="__m<int>d",
// then set the bit_len and simd_len accordingly
let re = Regex::new(r"__m\d+").unwrap();
let is_mask = |arg: &Argument<X86IntrinsicType>| arg.ty.param.etype.as_str() == "MASK";
let is_vector = |arg: &Argument<X86IntrinsicType>| re.is_match(arg.ty.param.type_data.as_str());
let pos = args.iter().position(|arg| is_mask(arg) && is_vector(arg));
if let Some(index) = pos {
args[index].ty.bit_len = args[0].ty.bit_len;
}
args.iter_mut().for_each(|arg| arg.ty.update_simd_len());
let arguments = ArgumentList::<X86IntrinsicType> { args };
let results = result?;
Ok(Intrinsic {
name,
arguments,
results,
arch_tags: intr.cpuid,
})
}

View file

@ -13,10 +13,6 @@ auto_llvm_sign_conversion: false
neon-stable: &neon-stable
FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']]
# #[cfg(not(target_arch = "arm"))]
target-not-arm: &target-not-arm
FnCall: [cfg, [{ FnCall: [not, ['target_arch = "arm"']]}]]
# #[cfg(not(target_arch = "arm64ec"))]
target-not-arm64ec: &target-not-arm64ec
FnCall: [cfg, [{ FnCall: [not, ['target_arch = "arm64ec"']]}]]
@ -63,6 +59,9 @@ neon-unstable-f16: &neon-unstable-f16
neon-unstable-feat-lut: &neon-unstable-feat-lut
FnCall: [unstable, ['feature = "stdarch_neon_feat_lut"', 'issue = "138050"']]
aarch64-unstable-jscvt: &aarch64-unstable-jscvt
FnCall: [unstable, ['feature = "stdarch_aarch64_jscvt"', 'issue = "147555"']]
# #[cfg(target_endian = "little")]
little-endian: &little-endian
FnCall: [cfg, ['target_endian = "little"']]
@ -8781,7 +8780,6 @@ intrinsics:
- [float64x1_t, float32x2_t]
- [float32x4_t, float64x2_t]
- [float64x2_t, float32x4_t]
big_endian_inverse: false
compose:
- FnCall: [transmute, [a]]
@ -8802,7 +8800,6 @@ intrinsics:
# q
- [float64x2_t, float16x8_t]
- [float16x8_t, float64x2_t]
big_endian_inverse: false
compose:
- FnCall: [transmute, [a]]
@ -13082,7 +13079,6 @@ intrinsics:
return_type: "{type[0]}"
attr:
- FnCall: [target_feature, ['enable = "crc"']]
- *target-not-arm
- FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["crc32x"]] }]]
- *aarch64-crc-stable
safety: safe
@ -13104,7 +13100,6 @@ intrinsics:
return_type: "{type[0]}"
attr:
- FnCall: [target_feature, ['enable = "crc"']]
- *target-not-arm
- FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["crc32cx"]] }]]
- *aarch64-crc-stable
safety: safe
@ -14267,3 +14262,21 @@ intrinsics:
- 'vluti4q_laneq_{neon_type[5]}_x2::<LANE>'
- - FnCall: [transmute, [a]]
- b
- name: "__jcvt"
doc: "Floating-point JavaScript convert to signed fixed-point, rounding toward zero"
arguments: ["a: {type}"]
return_type: "i32"
attr:
- FnCall: [target_feature, ['enable = "jsconv"']]
- FnCall: [cfg_attr, [test, { FnCall: [assert_instr, ["fjcvtzs"]] }]]
- *aarch64-unstable-jscvt
safety: safe
types:
- f64
compose:
- LLVMLink:
name: "fjcvtzs"
links:
- link: "llvm.aarch64.fjcvtzs"
arch: aarch64,arm64ec

View file

@ -8480,7 +8480,6 @@ intrinsics:
- [poly16x8_t, p128]
- [int8x16_t, p128]
- [uint8x16_t, p128]
big_endian_inverse: false
compose:
- FnCall: [transmute, [a]]
@ -8718,7 +8717,6 @@ intrinsics:
- [poly8x16_t, float32x4_t]
- [poly16x8_t, float32x4_t]
- [p128, float32x4_t]
big_endian_inverse: false
compose:
- FnCall: [transmute, [a]]
@ -8782,7 +8780,6 @@ intrinsics:
- [float16x8_t, uint16x8_t]
- [float16x8_t, uint32x4_t]
- [float16x8_t, uint64x2_t]
big_endian_inverse: false
compose:
- FnCall: [transmute, [a]]
@ -8807,7 +8804,6 @@ intrinsics:
- [poly128_t, float16x8_t]
- [float16x8_t, poly128_t]
- [float16x8_t, poly64x2_t]
big_endian_inverse: false
compose:
- FnCall: [transmute, [a]]

View file

@ -164,7 +164,7 @@ fn verify_all_signatures() {
// Open up the network console and you'll see an xml file was downloaded
// (currently called data-3.6.9.xml). That's the file we downloaded
// here.
let xml = include_bytes!("../x86-intel.xml");
let xml = include_bytes!("../../../intrinsics_data/x86-intel.xml");
let xml = &xml[..];
let data: Data = quick_xml::de::from_reader(xml).expect("failed to deserialize xml");

View file

@ -119753,5 +119753,28 @@
"LUTI4"
]
]
},
{
"SIMD_ISA": "Neon",
"name": "__jcvt",
"arguments": [
"float64_t a"
],
"return_type": {
"value": "int32_t"
},
"Arguments_Preparation": {
"a": {
"register": "Dn"
}
},
"Architectures": [
"A64"
],
"instructions": [
[
"FJCVTZS"
]
]
}
]

View file

@ -1 +1 @@
32e7a4b92b109c24e9822c862a7c74436b50e564
73e6c9ebd9123154a196300ef58e30ec8928e74e

View file

@ -217,50 +217,6 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
)?;
}
}
// Used to implement the _mm256_permute2f128_ps, _mm256_permute2f128_pd and
// _mm256_permute2f128_si256 functions. Regardless of the suffix in the name
// they all can be considered to operate on vectors of 128-bit elements.
// For each 128-bit element of `dest`, copies one from `left`, `right` or
// zero, according to `imm`.
"vperm2f128.ps.256" | "vperm2f128.pd.256" | "vperm2f128.si.256" => {
let [left, right, imm] =
this.check_shim_sig_lenient(abi, CanonAbi::C, link_name, args)?;
assert_eq!(dest.layout, left.layout);
assert_eq!(dest.layout, right.layout);
assert_eq!(dest.layout.size.bits(), 256);
// Transmute to `[u128; 2]` to process each 128-bit chunk independently.
let u128x2_layout =
this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u128, 2))?;
let left = left.transmute(u128x2_layout, this)?;
let right = right.transmute(u128x2_layout, this)?;
let dest = dest.transmute(u128x2_layout, this)?;
let imm = this.read_scalar(imm)?.to_u8()?;
for i in 0..2 {
let dest = this.project_index(&dest, i)?;
let imm = match i {
0 => imm & 0xF,
1 => imm >> 4,
_ => unreachable!(),
};
if imm & 0b100 != 0 {
this.write_scalar(Scalar::from_u128(0), &dest)?;
} else {
let src = match imm {
0b00 => this.project_index(&left, 0)?,
0b01 => this.project_index(&left, 1)?,
0b10 => this.project_index(&right, 0)?,
0b11 => this.project_index(&right, 1)?,
_ => unreachable!(),
};
this.copy_op(&src, &dest)?;
}
}
}
// Used to implement the _mm_maskload_ps, _mm_maskload_pd, _mm256_maskload_ps
// and _mm256_maskload_pd functions.
// For the element `i`, if the high bit of the `i`-th element of `mask`

View file

@ -829,15 +829,16 @@ unsafe fn test_avx() {
#[target_feature(enable = "avx")]
unsafe fn test_mm256_permute2f128_ps() {
let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
let r = _mm256_permute2f128_ps::<0x13>(a, b);
let e = _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.);
let a = _mm256_setr_ps(11., 12., 13., 14., 15., 16., 17., 18.);
let b = _mm256_setr_ps(21., 22., 23., 24., 25., 26., 27., 28.);
let r = _mm256_permute2f128_ps::<0b0001_0011>(a, b);
let e = _mm256_setr_ps(25., 26., 27., 28., 15., 16., 17., 18.);
assert_eq_m256(r, e);
let r = _mm256_permute2f128_ps::<0x44>(a, b);
let e = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
assert_eq_m256(r, e);
// Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
let r = _mm256_permute2f128_ps::<0b1001_1011>(a, b);
let z = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
assert_eq_m256(r, z);
}
test_mm256_permute2f128_ps();
@ -845,11 +846,12 @@ unsafe fn test_avx() {
unsafe fn test_mm256_permute2f128_pd() {
let a = _mm256_setr_pd(1., 2., 3., 4.);
let b = _mm256_setr_pd(5., 6., 7., 8.);
let r = _mm256_permute2f128_pd::<0x31>(a, b);
let r = _mm256_permute2f128_pd::<0b0011_0001>(a, b);
let e = _mm256_setr_pd(3., 4., 7., 8.);
assert_eq_m256d(r, e);
let r = _mm256_permute2f128_pd::<0x44>(a, b);
// Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
let r = _mm256_permute2f128_pd::<0b1011_1001>(a, b);
let e = _mm256_setr_pd(0.0, 0.0, 0.0, 0.0);
assert_eq_m256d(r, e);
}
@ -857,13 +859,14 @@ unsafe fn test_avx() {
#[target_feature(enable = "avx")]
unsafe fn test_mm256_permute2f128_si256() {
let a = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4);
let b = _mm256_setr_epi32(5, 6, 7, 8, 5, 6, 7, 8);
let r = _mm256_permute2f128_si256::<0x20>(a, b);
let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
let a = _mm256_setr_epi32(11, 12, 13, 14, 15, 16, 17, 18);
let b = _mm256_setr_epi32(21, 22, 23, 24, 25, 26, 27, 28);
let r = _mm256_permute2f128_si256::<0b0010_0000>(a, b);
let e = _mm256_setr_epi32(11, 12, 13, 14, 21, 22, 23, 24);
assert_eq_m256i(r, e);
let r = _mm256_permute2f128_si256::<0x44>(a, b);
// Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
let r = _mm256_permute2f128_si256::<0b1010_1000>(a, b);
let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0);
assert_eq_m256i(r, e);
}

View file

@ -9,10 +9,5 @@ use std::arch::x86_64::*;
fn main() {
assert!(!is_x86_feature_detected!("sse2"));
unsafe {
// This is a SSE2 intrinsic, but it behaves as a no-op when SSE2
// is not available, so it is always safe to call.
_mm_pause();
}
_mm_pause();
}

View file

@ -54,7 +54,7 @@ unsafe fn test_sse2() {
}
fn test_mm_pause() {
unsafe { _mm_pause() }
_mm_pause()
}
test_mm_pause();

View file

@ -7,17 +7,13 @@ mod x86 {
fn adc(c_in: u8, a: u32, b: u32) -> (u8, u32) {
let mut sum = 0;
// SAFETY: There are no safety requirements for calling `_addcarry_u32`.
// It's just unsafe for API consistency with other intrinsics.
let c_out = unsafe { arch::_addcarry_u32(c_in, a, b, &mut sum) };
let c_out = arch::_addcarry_u32(c_in, a, b, &mut sum);
(c_out, sum)
}
fn sbb(b_in: u8, a: u32, b: u32) -> (u8, u32) {
let mut sum = 0;
// SAFETY: There are no safety requirements for calling `_subborrow_u32`.
// It's just unsafe for API consistency with other intrinsics.
let b_out = unsafe { arch::_subborrow_u32(b_in, a, b, &mut sum) };
let b_out = arch::_subborrow_u32(b_in, a, b, &mut sum);
(b_out, sum)
}
@ -52,17 +48,13 @@ mod x86_64 {
fn adc(c_in: u8, a: u64, b: u64) -> (u8, u64) {
let mut sum = 0;
// SAFETY: There are no safety requirements for calling `_addcarry_u64`.
// It's just unsafe for API consistency with other intrinsics.
let c_out = unsafe { arch::_addcarry_u64(c_in, a, b, &mut sum) };
let c_out = arch::_addcarry_u64(c_in, a, b, &mut sum);
(c_out, sum)
}
fn sbb(b_in: u8, a: u64, b: u64) -> (u8, u64) {
let mut sum = 0;
// SAFETY: There are no safety requirements for calling `_subborrow_u64`.
// It's just unsafe for API consistency with other intrinsics.
let b_out = unsafe { arch::_subborrow_u64(b_in, a, b, &mut sum) };
let b_out = arch::_subborrow_u64(b_in, a, b, &mut sum);
(b_out, sum)
}