x86: add unsafe to all x86 vendor intrinsics

Also, add missing assert_instr tests to each intrinsic, where possible.
This commit is contained in:
Andrew Gallant 2017-09-26 21:53:50 -04:00
parent ff9e960628
commit 6dfc65289c
12 changed files with 1611 additions and 1213 deletions

View file

@ -24,9 +24,11 @@ mod example {
haystack.resize(16, 0);
let vhaystack = vendor::__m128i::from(s::u8x16::load(&haystack, 0));
vendor::_mm_cmpestri(
vneedle, needle_len as i32, vhaystack, hay_len as i32,
vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
unsafe {
vendor::_mm_cmpestri(
vneedle, needle_len as i32, vhaystack, hay_len as i32,
vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
}
}
pub fn main() {

View file

@ -19,7 +19,7 @@ use stdsimd_test::assert_instr;
#[inline(always)]
#[target_feature = "+lzcnt"]
#[cfg_attr(test, assert_instr(lzcnt))]
pub fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
pub unsafe fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
/// Counts the leading most significant zero bits.
///
@ -27,19 +27,19 @@ pub fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
#[inline(always)]
#[target_feature = "+lzcnt"]
#[cfg_attr(test, assert_instr(lzcnt))]
pub fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 }
pub unsafe fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 }
/// Counts the bits that are set.
#[inline(always)]
#[target_feature = "+popcnt"]
#[cfg_attr(test, assert_instr(popcnt))]
pub fn _popcnt32(x: u32) -> u32 { x.count_ones() }
pub unsafe fn _popcnt32(x: u32) -> u32 { x.count_ones() }
/// Counts the bits that are set.
#[inline(always)]
#[target_feature = "+popcnt"]
#[cfg_attr(test, assert_instr(popcnt))]
pub fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 }
pub unsafe fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 }
#[cfg(test)]
mod tests {
@ -49,21 +49,21 @@ mod tests {
#[simd_test = "lzcnt"]
fn _lzcnt_u32() {
assert_eq!(abm::_lzcnt_u32(0b0101_1010u32), 25u32);
assert_eq!(unsafe { abm::_lzcnt_u32(0b0101_1010u32) }, 25u32);
}
#[simd_test = "lzcnt"]
fn _lzcnt_u64() {
assert_eq!(abm::_lzcnt_u64(0b0101_1010u64), 57u64);
assert_eq!(unsafe { abm::_lzcnt_u64(0b0101_1010u64) }, 57u64);
}
#[simd_test = "popcnt"]
fn _popcnt32() {
assert_eq!(abm::_popcnt32(0b0101_1010u32), 4);
assert_eq!(unsafe { abm::_popcnt32(0b0101_1010u32) }, 4);
}
#[simd_test = "popcnt"]
fn _popcnt64() {
assert_eq!(abm::_popcnt64(0b0101_1010u64), 4);
assert_eq!(unsafe { abm::_popcnt64(0b0101_1010u64) }, 4);
}
}

View file

@ -1,14 +1,14 @@
use v256::*;
#[cfg(test)]
use stdsimd_test::assert_instr;
use v256::*;
/// Add packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vaddpd))]
pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
pub unsafe fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
a + b
}
@ -16,7 +16,7 @@ pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vaddps))]
pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
pub unsafe fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
a + b
}
@ -25,7 +25,7 @@ pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vmulpd))]
pub fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
pub unsafe fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
a * b
}
@ -33,7 +33,7 @@ pub fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vmulps))]
pub fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
pub unsafe fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
a * b
}
@ -42,8 +42,8 @@ pub fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vaddsubpd))]
pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
unsafe { addsubpd256(a, b) }
pub unsafe fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
addsubpd256(a, b)
}
/// Alternatively add and subtract packed single-precision (32-bit)
@ -51,8 +51,8 @@ pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vaddsubps))]
pub fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
unsafe { addsubps256(a, b) }
pub unsafe fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
addsubps256(a, b)
}
/// Subtract packed double-precision (64-bit) floating-point elements in `b`
@ -60,7 +60,7 @@ pub fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vsubpd))]
pub fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
pub unsafe fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
a - b
}
@ -69,25 +69,24 @@ pub fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vsubps))]
pub fn _mm256_sub_ps(a: f32x8, b: f32x8) -> f32x8 {
pub unsafe fn _mm256_sub_ps(a: f32x8, b: f32x8) -> f32x8 {
a - b
}
/// Round packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `b`. The value of `b` may be as follows:
///
/// ```ignore
/// 0x00: Round to the nearest whole number.
/// 0x01: Round down, toward negative infinity.
/// 0x02: Round up, toward positive infinity.
/// 0x03: Truncate the values.
/// For a few additional values options, check the LLVM docs:
/// https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
/// ```
#[inline(always)]
#[target_feature = "+avx"]
pub fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
pub unsafe fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
macro_rules! call {
($imm8:expr) => {
unsafe { roundpd256(a, $imm8) }
}
($imm8:expr) => { roundpd256(a, $imm8) }
}
constify_imm8!(b, call)
}
@ -96,7 +95,7 @@ pub fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
#[cfg_attr(test, assert_instr(vroundpd))]
#[target_feature = "+avx"]
fn test_mm256_round_pd(a: f64x4) -> f64x4 {
_mm256_round_pd(a, 0x3)
unsafe { _mm256_round_pd(a, 0x3) }
}
/// Round packed double-precision (64-bit) floating point elements in `a` toward
@ -104,8 +103,8 @@ fn test_mm256_round_pd(a: f64x4) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vroundpd))]
pub fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
unsafe { roundpd256(a, 0x02) }
pub unsafe fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
roundpd256(a, 0x02)
}
/// Round packed double-precision (64-bit) floating point elements in `a` toward
@ -113,8 +112,8 @@ pub fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vroundpd))]
pub fn _mm256_floor_pd(a: f64x4) -> f64x4 {
unsafe { roundpd256(a, 0x01) }
pub unsafe fn _mm256_floor_pd(a: f64x4) -> f64x4 {
roundpd256(a, 0x01)
}
/// LLVM intrinsics used in the above functions
@ -139,7 +138,7 @@ mod tests {
fn _mm256_add_pd() {
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_add_pd(a, b);
let r = unsafe { avx::_mm256_add_pd(a, b) };
let e = f64x4::new(6.0, 8.0, 10.0, 12.0);
assert_eq!(r, e);
}
@ -148,7 +147,7 @@ mod tests {
fn _mm256_add_ps() {
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
let r = avx::_mm256_add_ps(a, b);
let r = unsafe { avx::_mm256_add_ps(a, b) };
let e = f32x8::new(10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0);
assert_eq!(r, e);
}
@ -157,7 +156,7 @@ mod tests {
fn _mm256_mul_pd() {
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_mul_pd(a, b);
let r = unsafe { avx::_mm256_mul_pd(a, b) };
let e = f64x4::new(5.0, 12.0, 21.0, 32.0);
assert_eq!(r, e);
}
@ -166,7 +165,7 @@ mod tests {
fn _mm256_mul_ps() {
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
let r = avx::_mm256_mul_ps(a, b);
let r = unsafe { avx::_mm256_mul_ps(a, b) };
let e = f32x8::new(9.0, 20.0, 33.0, 48.0, 65.0, 84.0, 105.0, 128.0);
assert_eq!(r, e);
}
@ -175,7 +174,7 @@ mod tests {
fn _mm256_addsub_pd() {
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_addsub_pd(a, b);
let r = unsafe { avx::_mm256_addsub_pd(a, b) };
let e = f64x4::new(-4.0, 8.0, -4.0, 12.0);
assert_eq!(r, e);
}
@ -184,7 +183,7 @@ mod tests {
fn _mm256_addsub_ps() {
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0);
let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_addsub_ps(a, b);
let r = unsafe { avx::_mm256_addsub_ps(a, b) };
let e = f32x8::new(-4.0, 8.0, -4.0, 12.0, -4.0, 8.0, -4.0, 12.0);
assert_eq!(r, e);
}
@ -193,7 +192,7 @@ mod tests {
fn _mm256_sub_pd() {
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_sub_pd(a, b);
let r = unsafe { avx::_mm256_sub_pd(a, b) };
let e = f64x4::new(-4.0,-4.0,-4.0,-4.0);
assert_eq!(r, e);
}
@ -202,7 +201,7 @@ mod tests {
fn _mm256_sub_ps() {
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, -1.0, -2.0, -3.0, -4.0);
let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 3.0, 2.0, 1.0, 0.0);
let r = avx::_mm256_sub_ps(a, b);
let r = unsafe { avx::_mm256_sub_ps(a, b) };
let e = f32x8::new(-4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -4.0);
assert_eq!(r, e);
}
@ -210,9 +209,9 @@ mod tests {
#[simd_test = "avx"]
fn _mm256_round_pd() {
let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
let result_closest = avx::_mm256_round_pd(a, 0b00000000);
let result_down = avx::_mm256_round_pd(a, 0b00000001);
let result_up = avx::_mm256_round_pd(a, 0b00000010);
let result_closest = unsafe { avx::_mm256_round_pd(a, 0b00000000) };
let result_down = unsafe { avx::_mm256_round_pd(a, 0b00000001) };
let result_up = unsafe { avx::_mm256_round_pd(a, 0b00000010) };
let expected_closest = f64x4::new(2.0, 2.0, 4.0, -1.0);
let expected_down = f64x4::new(1.0, 2.0, 3.0, -2.0);
let expected_up = f64x4::new(2.0, 3.0, 4.0, -1.0);
@ -224,7 +223,7 @@ mod tests {
#[simd_test = "avx"]
fn _mm256_floor_pd() {
let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
let result_down = avx::_mm256_floor_pd(a);
let result_down = unsafe { avx::_mm256_floor_pd(a) };
let expected_down = f64x4::new(1.0, 2.0, 3.0, -2.0);
assert_eq!(result_down, expected_down);
}
@ -232,7 +231,7 @@ mod tests {
#[simd_test = "avx"]
fn _mm256_ceil_pd() {
let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
let result_up = avx::_mm256_ceil_pd(a, );
let result_up = unsafe { avx::_mm256_ceil_pd(a) };
let expected_up = f64x4::new(2.0, 3.0, 4.0, -1.0);
assert_eq!(result_up, expected_up);
}

File diff suppressed because it is too large Load diff

View file

@ -10,20 +10,12 @@
#[cfg(test)]
use stdsimd_test::assert_instr;
#[allow(dead_code)]
extern "C" {
#[link_name="llvm.x86.bmi.bextr.32"]
fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
#[link_name="llvm.x86.bmi.bextr.64"]
fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
}
/// Extracts bits in range [`start`, `start` + `length`) from `a` into
/// the least significant bits of the result.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(bextr))]
pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
_bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32))
}
@ -33,7 +25,7 @@ pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(bextr))]
#[cfg(not(target_arch = "x86"))]
pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
pub unsafe fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
_bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64))
}
@ -45,8 +37,8 @@ pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(bextr))]
pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
unsafe { x86_bmi_bextr_32(a, control) }
pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 {
x86_bmi_bextr_32(a, control)
}
/// Extracts bits of `a` specified by `control` into
@ -58,15 +50,15 @@ pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(bextr))]
#[cfg(not(target_arch = "x86"))]
pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
unsafe { x86_bmi_bextr_64(a, control) }
pub unsafe fn _bextr2_u64(a: u64, control: u64) -> u64 {
x86_bmi_bextr_64(a, control)
}
/// Bitwise logical `AND` of inverted `a` with `b`.
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(andn))]
pub fn _andn_u32(a: u32, b: u32) -> u32 {
pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 {
!a & b
}
@ -74,7 +66,7 @@ pub fn _andn_u32(a: u32, b: u32) -> u32 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(andn))]
pub fn _andn_u64(a: u64, b: u64) -> u64 {
pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 {
!a & b
}
@ -82,7 +74,7 @@ pub fn _andn_u64(a: u64, b: u64) -> u64 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsi))]
pub fn _blsi_u32(x: u32) -> u32 {
pub unsafe fn _blsi_u32(x: u32) -> u32 {
x & x.wrapping_neg()
}
@ -91,7 +83,7 @@ pub fn _blsi_u32(x: u32) -> u32 {
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsi))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _blsi_u64(x: u64) -> u64 {
pub unsafe fn _blsi_u64(x: u64) -> u64 {
x & x.wrapping_neg()
}
@ -99,7 +91,7 @@ pub fn _blsi_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsmsk))]
pub fn _blsmsk_u32(x: u32) -> u32 {
pub unsafe fn _blsmsk_u32(x: u32) -> u32 {
x ^ (x.wrapping_sub(1u32))
}
@ -108,7 +100,7 @@ pub fn _blsmsk_u32(x: u32) -> u32 {
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsmsk))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _blsmsk_u64(x: u64) -> u64 {
pub unsafe fn _blsmsk_u64(x: u64) -> u64 {
x ^ (x.wrapping_sub(1u64))
}
@ -118,7 +110,7 @@ pub fn _blsmsk_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsr))]
pub fn _blsr_u32(x: u32) -> u32 {
pub unsafe fn _blsr_u32(x: u32) -> u32 {
x & (x.wrapping_sub(1))
}
@ -129,7 +121,7 @@ pub fn _blsr_u32(x: u32) -> u32 {
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(blsr))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _blsr_u64(x: u64) -> u64 {
pub unsafe fn _blsr_u64(x: u64) -> u64 {
x & (x.wrapping_sub(1))
}
@ -139,7 +131,7 @@ pub fn _blsr_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _tzcnt_u16(x: u16) -> u16 {
pub unsafe fn _tzcnt_u16(x: u16) -> u16 {
x.trailing_zeros() as u16
}
@ -149,7 +141,7 @@ pub fn _tzcnt_u16(x: u16) -> u16 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _tzcnt_u32(x: u32) -> u32 {
pub unsafe fn _tzcnt_u32(x: u32) -> u32 {
x.trailing_zeros()
}
@ -159,7 +151,7 @@ pub fn _tzcnt_u32(x: u32) -> u32 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _tzcnt_u64(x: u64) -> u64 {
pub unsafe fn _tzcnt_u64(x: u64) -> u64 {
x.trailing_zeros() as u64
}
@ -169,7 +161,7 @@ pub fn _tzcnt_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _mm_tzcnt_u32(x: u32) -> u32 {
pub unsafe fn _mm_tzcnt_u32(x: u32) -> u32 {
x.trailing_zeros()
}
@ -179,10 +171,18 @@ pub fn _mm_tzcnt_u32(x: u32) -> u32 {
#[inline(always)]
#[target_feature = "+bmi"]
#[cfg_attr(test, assert_instr(tzcnt))]
pub fn _mm_tzcnt_u64(x: u64) -> u64 {
pub unsafe fn _mm_tzcnt_u64(x: u64) -> u64 {
x.trailing_zeros() as u64
}
#[allow(dead_code)]
extern "C" {
#[link_name="llvm.x86.bmi.bextr.32"]
fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
#[link_name="llvm.x86.bmi.bextr.64"]
fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
}
#[cfg(test)]
mod tests {
use stdsimd_test::simd_test;
@ -191,98 +191,122 @@ mod tests {
#[simd_test = "bmi"]
fn _bextr_u32() {
assert_eq!(bmi::_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32);
let r = unsafe { bmi::_bextr_u32(0b0101_0000u32, 4, 4) };
assert_eq!(r, 0b0000_0101u32);
}
#[simd_test = "bmi"]
#[cfg(not(target_arch = "x86"))]
fn _bextr_u64() {
assert_eq!(bmi::_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64);
let r = unsafe { bmi::_bextr_u64(0b0101_0000u64, 4, 4) };
assert_eq!(r, 0b0000_0101u64);
}
#[simd_test = "bmi"]
fn _andn_u32() {
assert_eq!(bmi::_andn_u32(0, 0), 0);
assert_eq!(bmi::_andn_u32(0, 1), 1);
assert_eq!(bmi::_andn_u32(1, 0), 0);
assert_eq!(bmi::_andn_u32(1, 1), 0);
assert_eq!(unsafe { bmi::_andn_u32(0, 0) }, 0);
assert_eq!(unsafe { bmi::_andn_u32(0, 1) }, 1);
assert_eq!(unsafe { bmi::_andn_u32(1, 0) }, 0);
assert_eq!(unsafe { bmi::_andn_u32(1, 1) }, 0);
assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32), 0b0000_0000u32);
assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32), 0b1111_1111u32);
assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32), 0b0000_0000u32);
assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32), 0b0000_0000u32);
assert_eq!(bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32), 0b0001_1101u32);
let r = unsafe { bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32) };
assert_eq!(r, 0b0000_0000u32);
let r = unsafe { bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32) };
assert_eq!(r, 0b1111_1111u32);
let r = unsafe { bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32) };
assert_eq!(r, 0b0000_0000u32);
let r = unsafe { bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32) };
assert_eq!(r, 0b0000_0000u32);
let r = unsafe { bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32) };
assert_eq!(r, 0b0001_1101u32);
}
#[simd_test = "bmi"]
#[cfg(not(target_arch = "x86"))]
fn _andn_u64() {
assert_eq!(bmi::_andn_u64(0, 0), 0);
assert_eq!(bmi::_andn_u64(0, 1), 1);
assert_eq!(bmi::_andn_u64(1, 0), 0);
assert_eq!(bmi::_andn_u64(1, 1), 0);
assert_eq!(unsafe { bmi::_andn_u64(0, 0) }, 0);
assert_eq!(unsafe { bmi::_andn_u64(0, 1) }, 1);
assert_eq!(unsafe { bmi::_andn_u64(1, 0) }, 0);
assert_eq!(unsafe { bmi::_andn_u64(1, 1) }, 0);
assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b0000_0000u64), 0b0000_0000u64);
assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b1111_1111u64), 0b1111_1111u64);
assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b0000_0000u64), 0b0000_0000u64);
assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b1111_1111u64), 0b0000_0000u64);
assert_eq!(bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64), 0b0001_1101u64);
let r = unsafe { bmi::_andn_u64(0b0000_0000u64, 0b0000_0000u64) };
assert_eq!(r, 0b0000_0000u64);
let r = unsafe { bmi::_andn_u64(0b0000_0000u64, 0b1111_1111u64) };
assert_eq!(r, 0b1111_1111u64);
let r = unsafe { bmi::_andn_u64(0b1111_1111u64, 0b0000_0000u64) };
assert_eq!(r, 0b0000_0000u64);
let r = unsafe { bmi::_andn_u64(0b1111_1111u64, 0b1111_1111u64) };
assert_eq!(r, 0b0000_0000u64);
let r = unsafe { bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64) };
assert_eq!(r, 0b0001_1101u64);
}
#[simd_test = "bmi"]
fn _blsi_u32() {
assert_eq!(bmi::_blsi_u32(0b1101_0000u32), 0b0001_0000u32);
assert_eq!(unsafe { bmi::_blsi_u32(0b1101_0000u32) }, 0b0001_0000u32);
}
#[simd_test = "bmi"]
#[cfg(not(target_arch = "x86"))]
fn _blsi_u64() {
assert_eq!(bmi::_blsi_u64(0b1101_0000u64), 0b0001_0000u64);
assert_eq!(unsafe { bmi::_blsi_u64(0b1101_0000u64) }, 0b0001_0000u64);
}
#[simd_test = "bmi"]
fn _blsmsk_u32() {
assert_eq!(bmi::_blsmsk_u32(0b0011_0000u32), 0b0001_1111u32);
let r = unsafe { bmi::_blsmsk_u32(0b0011_0000u32) };
assert_eq!(r, 0b0001_1111u32);
}
#[simd_test = "bmi"]
#[cfg(not(target_arch = "x86"))]
fn _blsmsk_u64() {
assert_eq!(bmi::_blsmsk_u64(0b0011_0000u64), 0b0001_1111u64);
let r = unsafe { bmi::_blsmsk_u64(0b0011_0000u64) };
assert_eq!(r, 0b0001_1111u64);
}
#[simd_test = "bmi"]
fn _blsr_u32() {
/// TODO: test the behavior when the input is 0
assert_eq!(bmi::_blsr_u32(0b0011_0000u32), 0b0010_0000u32);
// TODO: test the behavior when the input is 0
let r = unsafe { bmi::_blsr_u32(0b0011_0000u32) };
assert_eq!(r, 0b0010_0000u32);
}
#[simd_test = "bmi"]
#[cfg(not(target_arch = "x86"))]
fn _blsr_u64() {
/// TODO: test the behavior when the input is 0
assert_eq!(bmi::_blsr_u64(0b0011_0000u64), 0b0010_0000u64);
// TODO: test the behavior when the input is 0
let r = unsafe { bmi::_blsr_u64(0b0011_0000u64) };
assert_eq!(r, 0b0010_0000u64);
}
#[simd_test = "bmi"]
fn _tzcnt_u16() {
assert_eq!(bmi::_tzcnt_u16(0b0000_0001u16), 0u16);
assert_eq!(bmi::_tzcnt_u16(0b0000_0000u16), 16u16);
assert_eq!(bmi::_tzcnt_u16(0b1001_0000u16), 4u16);
assert_eq!(unsafe { bmi::_tzcnt_u16(0b0000_0001u16) }, 0u16);
assert_eq!(unsafe { bmi::_tzcnt_u16(0b0000_0000u16) }, 16u16);
assert_eq!(unsafe { bmi::_tzcnt_u16(0b1001_0000u16) }, 4u16);
}
#[simd_test = "bmi"]
fn _tzcnt_u32() {
assert_eq!(bmi::_tzcnt_u32(0b0000_0001u32), 0u32);
assert_eq!(bmi::_tzcnt_u32(0b0000_0000u32), 32u32);
assert_eq!(bmi::_tzcnt_u32(0b1001_0000u32), 4u32);
assert_eq!(unsafe { bmi::_tzcnt_u32(0b0000_0001u32) }, 0u32);
assert_eq!(unsafe { bmi::_tzcnt_u32(0b0000_0000u32) }, 32u32);
assert_eq!(unsafe { bmi::_tzcnt_u32(0b1001_0000u32) }, 4u32);
}
#[simd_test = "bmi"]
#[cfg(not(target_arch = "x86"))]
fn _tzcnt_u64() {
assert_eq!(bmi::_tzcnt_u64(0b0000_0001u64), 0u64);
assert_eq!(bmi::_tzcnt_u64(0b0000_0000u64), 64u64);
assert_eq!(bmi::_tzcnt_u64(0b1001_0000u64), 4u64);
assert_eq!(unsafe { bmi::_tzcnt_u64(0b0000_0001u64) }, 0u64);
assert_eq!(unsafe { bmi::_tzcnt_u64(0b0000_0000u64) }, 64u64);
assert_eq!(unsafe { bmi::_tzcnt_u64(0b1001_0000u64) }, 4u64);
}
}

View file

@ -19,7 +19,7 @@ use stdsimd_test::assert_instr;
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))]
#[cfg_attr(all(test, target_arch = "x86"), assert_instr(mulx))]
#[target_feature = "+bmi2"]
pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
pub unsafe fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
let result: u64 = (a as u64) * (b as u64);
let hi = (result >> 32) as u32;
(result as u32, hi)
@ -33,12 +33,67 @@ pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
#[cfg_attr(test, assert_instr(mulx))]
#[target_feature = "+bmi2"]
#[cfg(not(target_arch = "x86"))] // calls an intrinsic
pub fn _mulx_u64(a: u64, b: u64) -> (u64, u64) {
pub unsafe fn _mulx_u64(a: u64, b: u64) -> (u64, u64) {
let result: u128 = (a as u128) * (b as u128);
let hi = (result >> 64) as u64;
(result as u64, hi)
}
/// Zero higher bits of `a` >= `index`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(bzhi))]
pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 {
x86_bmi2_bzhi_32(a, index)
}
/// Zero higher bits of `a` >= `index`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(bzhi))]
#[cfg(not(target_arch = "x86"))]
pub unsafe fn _bzhi_u64(a: u64, index: u64) -> u64 {
x86_bmi2_bzhi_64(a, index)
}
/// Scatter contiguous low order bits of `a` to the result at the positions
/// specified by the `mask`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pdep))]
pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 {
x86_bmi2_pdep_32(a, mask)
}
/// Scatter contiguous low order bits of `a` to the result at the positions
/// specified by the `mask`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pdep))]
#[cfg(not(target_arch = "x86"))]
pub unsafe fn _pdep_u64(a: u64, mask: u64) -> u64 {
x86_bmi2_pdep_64(a, mask)
}
/// Gathers the bits of `x` specified by the `mask` into the contiguous low
/// order bit positions of the result.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pext))]
pub unsafe fn _pext_u32(a: u32, mask: u32) -> u32 {
x86_bmi2_pext_32(a, mask)
}
/// Gathers the bits of `x` specified by the `mask` into the contiguous low
/// order bit positions of the result.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pext))]
#[cfg(not(target_arch = "x86"))]
pub unsafe fn _pext_u64(a: u64, mask: u64) -> u64 {
x86_bmi2_pext_64(a, mask)
}
#[allow(dead_code)]
extern "C" {
#[link_name="llvm.x86.bmi.bzhi.32"]
@ -55,63 +110,6 @@ extern "C" {
fn x86_bmi2_pext_64(x: u64, y: u64) -> u64;
}
/// Zero higher bits of `a` >= `index`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(bzhi))]
pub fn _bzhi_u32(a: u32, index: u32) -> u32 {
unsafe { x86_bmi2_bzhi_32(a, index) }
}
/// Zero higher bits of `a` >= `index`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(bzhi))]
#[cfg(not(target_arch = "x86"))]
pub fn _bzhi_u64(a: u64, index: u64) -> u64 {
unsafe { x86_bmi2_bzhi_64(a, index) }
}
/// Scatter contiguous low order bits of `a` to the result at the positions
/// specified by the `mask`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pdep))]
pub fn _pdep_u32(a: u32, mask: u32) -> u32 {
unsafe { x86_bmi2_pdep_32(a, mask) }
}
/// Scatter contiguous low order bits of `a` to the result at the positions
/// specified by the `mask`.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pdep))]
#[cfg(not(target_arch = "x86"))]
pub fn _pdep_u64(a: u64, mask: u64) -> u64 {
unsafe { x86_bmi2_pdep_64(a, mask) }
}
/// Gathers the bits of `x` specified by the `mask` into the contiguous low
/// order bit positions of the result.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pext))]
pub fn _pext_u32(a: u32, mask: u32) -> u32 {
unsafe { x86_bmi2_pext_32(a, mask) }
}
/// Gathers the bits of `x` specified by the `mask` into the contiguous low
/// order bit positions of the result.
#[inline(always)]
#[target_feature = "+bmi2"]
#[cfg_attr(test, assert_instr(pext))]
#[cfg(not(target_arch = "x86"))]
pub fn _pext_u64(a: u64, mask: u64) -> u64 {
unsafe { x86_bmi2_pext_64(a, mask) }
}
#[cfg(test)]
mod tests {
use stdsimd_test::simd_test;
@ -128,8 +126,8 @@ mod tests {
let m1 = 0b1110_1011_1110_1111u32;
let s1 = 0b0001_0111_0100_0011u32;
assert_eq!(bmi2::_pext_u32(n, m0), s0);
assert_eq!(bmi2::_pext_u32(n, m1), s1);
assert_eq!(unsafe { bmi2::_pext_u32(n, m0) }, s0);
assert_eq!(unsafe { bmi2::_pext_u32(n, m1) }, s1);
}
#[simd_test = "bmi2"]
@ -143,8 +141,8 @@ mod tests {
let m1 = 0b1110_1011_1110_1111u64;
let s1 = 0b0001_0111_0100_0011u64;
assert_eq!(bmi2::_pext_u64(n, m0), s0);
assert_eq!(bmi2::_pext_u64(n, m1), s1);
assert_eq!(unsafe { bmi2::_pext_u64(n, m0) }, s0);
assert_eq!(unsafe { bmi2::_pext_u64(n, m1) }, s1);
}
#[simd_test = "bmi2"]
@ -157,8 +155,8 @@ mod tests {
let m1 = 0b1110_1011_1110_1111u32;
let s1 = 0b1110_1001_0010_0011u32;
assert_eq!(bmi2::_pdep_u32(n, m0), s0);
assert_eq!(bmi2::_pdep_u32(n, m1), s1);
assert_eq!(unsafe { bmi2::_pdep_u32(n, m0) }, s0);
assert_eq!(unsafe { bmi2::_pdep_u32(n, m1) }, s1);
}
#[simd_test = "bmi2"]
@ -172,15 +170,15 @@ mod tests {
let m1 = 0b1110_1011_1110_1111u64;
let s1 = 0b1110_1001_0010_0011u64;
assert_eq!(bmi2::_pdep_u64(n, m0), s0);
assert_eq!(bmi2::_pdep_u64(n, m1), s1);
assert_eq!(unsafe { bmi2::_pdep_u64(n, m0) }, s0);
assert_eq!(unsafe { bmi2::_pdep_u64(n, m1) }, s1);
}
#[simd_test = "bmi2"]
fn _bzhi_u32() {
let n = 0b1111_0010u32;
let s = 0b0001_0010u32;
assert_eq!(bmi2::_bzhi_u32(n, 5), s);
assert_eq!(unsafe { bmi2::_bzhi_u32(n, 5) }, s);
}
#[simd_test = "bmi2"]
@ -188,14 +186,14 @@ mod tests {
fn _bzhi_u64() {
let n = 0b1111_0010u64;
let s = 0b0001_0010u64;
assert_eq!(bmi2::_bzhi_u64(n, 5), s);
assert_eq!(unsafe { bmi2::_bzhi_u64(n, 5) }, s);
}
#[simd_test = "bmi2"]
fn _mulx_u32() {
let a: u32 = 4_294_967_200;
let b: u32 = 2;
let (lo, hi): (u32, u32) = bmi2::_mulx_u32(a, b);
let (lo, hi): (u32, u32) = unsafe { bmi2::_mulx_u32(a, b) };
// result = 8589934400
// = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64
// ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -208,7 +206,7 @@ mod tests {
fn _mulx_u64() {
let a: u64 = 9_223_372_036_854_775_800;
let b: u64 = 100;
let (lo, hi): (u64, u64) = bmi2::_mulx_u64(a, b);
let (lo, hi): (u64, u64) = unsafe { bmi2::_mulx_u64(a, b) };
// result = 922337203685477580000
// = 0b00110001_11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u128
// ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View file

@ -9,15 +9,15 @@ use stdsimd_test::assert_instr;
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(addss))]
pub fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 {
unsafe { addss(a, b) }
pub unsafe fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 {
addss(a, b)
}
/// Adds f32x4 vectors.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(addps))]
pub fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
pub unsafe fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
a + b
}
@ -26,15 +26,15 @@ pub fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(subss))]
pub fn _mm_sub_ss(a: f32x4, b: f32x4) -> f32x4 {
unsafe { subss(a, b) }
pub unsafe fn _mm_sub_ss(a: f32x4, b: f32x4) -> f32x4 {
subss(a, b)
}
/// Subtracts f32x4 vectors.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(subps))]
pub fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
pub unsafe fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
a - b
}
@ -43,15 +43,15 @@ pub fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(mulss))]
pub fn _mm_mul_ss(a: f32x4, b: f32x4) -> f32x4 {
unsafe { mulss(a, b) }
pub unsafe fn _mm_mul_ss(a: f32x4, b: f32x4) -> f32x4 {
mulss(a, b)
}
/// Multiplies f32x4 vectors.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(mulps))]
pub fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
pub unsafe fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
a * b
}
@ -60,15 +60,15 @@ pub fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(divss))]
pub fn _mm_div_ss(a: f32x4, b: f32x4) -> f32x4 {
unsafe { divss(a, b) }
pub unsafe fn _mm_div_ss(a: f32x4, b: f32x4) -> f32x4 {
divss(a, b)
}
/// Divides f32x4 vectors.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(divps))]
pub fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
pub unsafe fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
a / b
}
@ -77,8 +77,8 @@ pub fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(sqrtss))]
pub fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
unsafe { sqrtss(a) }
pub unsafe fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
sqrtss(a)
}
/// Return the square root of packed single-precision (32-bit) floating-point
@ -86,8 +86,8 @@ pub fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(sqrtps))]
pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
unsafe { sqrtps(a) }
pub unsafe fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
sqrtps(a)
}
/// Return the approximate reciprocal of the first single-precision
@ -95,8 +95,8 @@ pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(rcpss))]
pub fn _mm_rcp_ss(a: f32x4) -> f32x4 {
unsafe { rcpss(a) }
pub unsafe fn _mm_rcp_ss(a: f32x4) -> f32x4 {
rcpss(a)
}
/// Return the approximate reciprocal of packed single-precision (32-bit)
@ -104,8 +104,8 @@ pub fn _mm_rcp_ss(a: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(rcpps))]
pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
unsafe { rcpps(a) }
pub unsafe fn _mm_rcp_ps(a: f32x4) -> f32x4 {
rcpps(a)
}
/// Return the approximate reciprocal square root of the fist single-precision
@ -113,8 +113,8 @@ pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(rsqrtss))]
pub fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
unsafe { rsqrtss(a) }
pub unsafe fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
rsqrtss(a)
}
/// Return the approximate reciprocal square root of packed single-precision
@ -122,8 +122,8 @@ pub fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(rsqrtps))]
pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
unsafe { rsqrtps(a) }
pub unsafe fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
rsqrtps(a)
}
/// Compare the first single-precision (32-bit) floating-point element of `a`
@ -132,8 +132,8 @@ pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(minss))]
pub fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
unsafe { minss(a, b) }
pub unsafe fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
minss(a, b)
}
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
@ -141,8 +141,8 @@ pub fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(minps))]
pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
unsafe { minps(a, b) }
pub unsafe fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
minps(a, b)
}
/// Compare the first single-precision (32-bit) floating-point element of `a`
@ -151,8 +151,8 @@ pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(maxss))]
pub fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
unsafe { maxss(a, b) }
pub unsafe fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
maxss(a, b)
}
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
@ -160,24 +160,23 @@ pub fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(maxps))]
pub fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
unsafe { maxps(a, b) }
pub unsafe fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
maxps(a, b)
}
// Shuffle packed single-precision (32-bit) floating-point elements in `a` and `b`
// using `mask`.
// The lower half of result takes values from `a` and the higher half from `b`.
// Mask is split to 2 control bits each to index the element from inputs.
/// Shuffle packed single-precision (32-bit) floating-point elements in `a` and
/// `b` using `mask`.
///
/// The lower half of result takes values from `a` and the higher half from
/// `b`. Mask is split to 2 control bits each to index the element from inputs.
#[inline(always)]
#[target_feature = "+sse"]
pub fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
pub unsafe fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
let mask = (mask & 0xFF) as u8;
macro_rules! shuffle_done {
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
unsafe {
simd_shuffle4(a, b, [$x01, $x23, $x45, $x67])
}
simd_shuffle4(a, b, [$x01, $x23, $x45, $x67])
}
}
macro_rules! shuffle_x67 {
@ -219,10 +218,10 @@ pub fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
}
#[cfg(test)]
#[cfg_attr(test, assert_instr(shufps))]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(shufps))]
fn _test_mm_shuffle_ps(a: f32x4, b: f32x4) -> f32x4 {
_mm_shuffle_ps(a, b, 3)
unsafe { _mm_shuffle_ps(a, b, 3) }
}
/// Unpack and interleave single-precision (32-bit) floating-point elements
@ -230,8 +229,8 @@ fn _test_mm_shuffle_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(unpckhps))]
pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
unsafe { simd_shuffle4(a, b, [2, 6, 3, 7]) }
pub unsafe fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
simd_shuffle4(a, b, [2, 6, 3, 7])
}
/// Unpack and interleave single-precision (32-bit) floating-point elements
@ -239,8 +238,8 @@ pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(unpcklps))]
pub fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
unsafe { simd_shuffle4(a, b, [0, 4, 1, 5]) }
pub unsafe fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
simd_shuffle4(a, b, [0, 4, 1, 5])
}
/// Combine higher half of `a` and `b`. The highwe half of `b` occupies the lower
@ -249,9 +248,9 @@ pub fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
#[target_feature = "+sse"]
#[cfg_attr(all(test, not(windows)), assert_instr(movhlps))]
#[cfg_attr(all(test, windows), assert_instr(unpckhpd))]
pub fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
pub unsafe fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
// TODO; figure why this is a different instruction on Windows?
unsafe { simd_shuffle4(a, b, [6, 7, 2, 3]) }
simd_shuffle4(a, b, [6, 7, 2, 3])
}
/// Combine lower half of `a` and `b`. The lower half of `b` occupies the higher
@ -259,8 +258,8 @@ pub fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(unpcklpd))]
pub fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
unsafe { simd_shuffle4(a, b, [0, 1, 4, 5]) }
pub unsafe fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
simd_shuffle4(a, b, [0, 1, 4, 5])
}
/// Return a mask of the most significant bit of each element in `a`.
@ -270,8 +269,8 @@ pub fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movmskps))]
pub fn _mm_movemask_ps(a: f32x4) -> i32 {
unsafe { movmskps(a) }
pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
movmskps(a)
}
#[allow(improper_ctypes)]
@ -318,7 +317,7 @@ mod tests {
fn _mm_add_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_add_ps(a, b);
let r = unsafe { sse::_mm_add_ps(a, b) };
assert_eq!(r, f32x4::new(-101.0, 25.0, 0.0, -15.0));
}
@ -326,7 +325,7 @@ mod tests {
fn _mm_add_ss() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_add_ss(a, b);
let r = unsafe { sse::_mm_add_ss(a, b) };
assert_eq!(r, f32x4::new(-101.0, 5.0, 0.0, -10.0));
}
@ -334,7 +333,7 @@ mod tests {
fn _mm_sub_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_sub_ps(a, b);
let r = unsafe { sse::_mm_sub_ps(a, b) };
assert_eq!(r, f32x4::new(99.0, -15.0, 0.0, -5.0));
}
@ -342,7 +341,7 @@ mod tests {
fn _mm_sub_ss() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_sub_ss(a, b);
let r = unsafe { sse::_mm_sub_ss(a, b) };
assert_eq!(r, f32x4::new(99.0, 5.0, 0.0, -10.0));
}
@ -350,7 +349,7 @@ mod tests {
fn _mm_mul_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_mul_ps(a, b);
let r = unsafe { sse::_mm_mul_ps(a, b) };
assert_eq!(r, f32x4::new(100.0, 100.0, 0.0, 50.0));
}
@ -358,7 +357,7 @@ mod tests {
fn _mm_mul_ss() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_mul_ss(a, b);
let r = unsafe { sse::_mm_mul_ss(a, b) };
assert_eq!(r, f32x4::new(100.0, 5.0, 0.0, -10.0));
}
@ -366,7 +365,7 @@ mod tests {
fn _mm_div_ps() {
let a = f32x4::new(-1.0, 5.0, 2.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.2, -5.0);
let r = sse::_mm_div_ps(a, b);
let r = unsafe { sse::_mm_div_ps(a, b) };
assert_eq!(r, f32x4::new(0.01, 0.25, 10.0, 2.0));
}
@ -374,14 +373,14 @@ mod tests {
fn _mm_div_ss() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_div_ss(a, b);
let r = unsafe { sse::_mm_div_ss(a, b) };
assert_eq!(r, f32x4::new(0.01, 5.0, 0.0, -10.0));
}
#[simd_test = "sse"]
fn _mm_sqrt_ss() {
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
let r = sse::_mm_sqrt_ss(a);
let r = unsafe { sse::_mm_sqrt_ss(a) };
let e = f32x4::new(2.0, 13.0, 16.0, 100.0);
assert_eq!(r, e);
}
@ -389,7 +388,7 @@ mod tests {
#[simd_test = "sse"]
fn _mm_sqrt_ps() {
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
let r = sse::_mm_sqrt_ps(a);
let r = unsafe { sse::_mm_sqrt_ps(a) };
let e = f32x4::new(2.0, 3.6055512, 4.0, 10.0);
assert_eq!(r, e);
}
@ -397,7 +396,7 @@ mod tests {
#[simd_test = "sse"]
fn _mm_rcp_ss() {
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
let r = sse::_mm_rcp_ss(a);
let r = unsafe { sse::_mm_rcp_ss(a) };
let e = f32x4::new(0.24993896, 13.0, 16.0, 100.0);
assert_eq!(r, e);
}
@ -405,7 +404,7 @@ mod tests {
#[simd_test = "sse"]
fn _mm_rcp_ps() {
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
let r = sse::_mm_rcp_ps(a);
let r = unsafe { sse::_mm_rcp_ps(a) };
let e = f32x4::new(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
assert_eq!(r, e);
}
@ -413,7 +412,7 @@ mod tests {
#[simd_test = "sse"]
fn _mm_rsqrt_ss() {
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
let r = sse::_mm_rsqrt_ss(a);
let r = unsafe { sse::_mm_rsqrt_ss(a) };
let e = f32x4::new(0.49987793, 13.0, 16.0, 100.0);
assert_eq!(r, e);
}
@ -421,7 +420,7 @@ mod tests {
#[simd_test = "sse"]
fn _mm_rsqrt_ps() {
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
let r = sse::_mm_rsqrt_ps(a);
let r = unsafe { sse::_mm_rsqrt_ps(a) };
let e = f32x4::new(0.49987793, 0.2772827, 0.24993896, 0.099990845);
assert_eq!(r, e);
}
@ -430,7 +429,7 @@ mod tests {
fn _mm_min_ss() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_min_ss(a, b);
let r = unsafe { sse::_mm_min_ss(a, b) };
assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
}
@ -438,7 +437,7 @@ mod tests {
fn _mm_min_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_min_ps(a, b);
let r = unsafe { sse::_mm_min_ps(a, b) };
assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
}
@ -446,7 +445,7 @@ mod tests {
fn _mm_max_ss() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_max_ss(a, b);
let r = unsafe { sse::_mm_max_ss(a, b) };
assert_eq!(r, f32x4::new(-1.0, 5.0, 0.0, -10.0));
}
@ -454,7 +453,7 @@ mod tests {
fn _mm_max_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse::_mm_max_ps(a, b);
let r = unsafe { sse::_mm_max_ps(a, b) };
assert_eq!(r, f32x4::new(-1.0, 20.0, 0.0, -5.0));
}
@ -463,7 +462,7 @@ mod tests {
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
let mask = 0b00_01_01_11;
let r = sse::_mm_shuffle_ps(a, b, mask);
let r = unsafe { sse::_mm_shuffle_ps(a, b, mask) };
assert_eq!(r, f32x4::new(4.0, 2.0, 6.0, 5.0));
}
@ -471,7 +470,7 @@ mod tests {
fn _mm_unpackhi_ps() {
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
let r = sse::_mm_unpackhi_ps(a, b);
let r = unsafe { sse::_mm_unpackhi_ps(a, b) };
assert_eq!(r, f32x4::new(3.0, 7.0, 4.0, 8.0));
}
@ -479,7 +478,7 @@ mod tests {
fn _mm_unpacklo_ps() {
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
let r = sse::_mm_unpacklo_ps(a, b);
let r = unsafe { sse::_mm_unpacklo_ps(a, b) };
assert_eq!(r, f32x4::new(1.0, 5.0, 2.0, 6.0));
}
@ -487,7 +486,7 @@ mod tests {
fn _mm_movehl_ps() {
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
let r = sse::_mm_movehl_ps(a, b);
let r = unsafe { sse::_mm_movehl_ps(a, b) };
assert_eq!(r, f32x4::new(7.0, 8.0, 3.0, 4.0));
}
@ -495,16 +494,20 @@ mod tests {
fn _mm_movelh_ps() {
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
let r = sse::_mm_movelh_ps(a, b);
let r = unsafe { sse::_mm_movelh_ps(a, b) };
assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0));
}
#[simd_test = "sse"]
fn _mm_movemask_ps() {
let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0));
let r = unsafe {
sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0))
};
assert_eq!(r, 0b0101);
let r = sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0));
let r = unsafe {
sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0))
};
assert_eq!(r, 0b0111);
}
}

File diff suppressed because it is too large Load diff

View file

@ -1,18 +1,18 @@
use v128::*;
use x86::__m128i;
#[cfg(test)]
use stdsimd_test::assert_instr;
use v128::*;
use x86::__m128i;
#[inline(always)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(pblendvb))]
pub fn _mm_blendv_epi8(
pub unsafe fn _mm_blendv_epi8(
a: __m128i,
b: __m128i,
mask: __m128i,
) -> __m128i {
unsafe { pblendvb(a, b, mask) }
pblendvb(a, b, mask)
}
/// Returns the dot product of two f64x2 vectors.
@ -24,15 +24,20 @@ pub fn _mm_blendv_epi8(
/// the broadcast mask bit is zero then the return component will be zero.
#[inline(always)]
#[target_feature = "+sse4.1"]
pub fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
pub unsafe fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
macro_rules! call {
($imm8:expr) => {
unsafe { dppd(a, b, $imm8) }
}
($imm8:expr) => { dppd(a, b, $imm8) }
}
constify_imm8!(imm8, call)
}
#[cfg(test)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(dppd))]
fn _test_mm_dp_pd(a: f64x2, b: f64x2) -> f64x2 {
unsafe { _mm_dp_pd(a, b, 0) }
}
/// Returns the dot product of two f32x4 vectors.
///
/// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask.
@ -42,15 +47,20 @@ pub fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
/// the broadcast mask bit is zero then the return component will be zero.
#[inline(always)]
#[target_feature = "+sse4.1"]
pub fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
macro_rules! call {
($imm8:expr) => {
unsafe { dpps(a, b, $imm8) }
}
($imm8:expr) => { dpps(a, b, $imm8) }
}
constify_imm8!(imm8, call)
}
#[cfg(test)]
#[target_feature = "+sse4.1"]
#[cfg_attr(test, assert_instr(dpps))]
fn _test_mm_dp_ps(a: f32x4, b: f32x4) -> f32x4 {
unsafe { _mm_dp_ps(a, b, 0) }
}
#[allow(improper_ctypes)]
extern {
#[link_name = "llvm.x86.sse41.pblendvb"]
@ -78,7 +88,7 @@ mod tests {
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
let e = i8x16::new(
0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31);
assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e);
assert_eq!(unsafe { sse41::_mm_blendv_epi8(a, b, mask) }, e);
}
#[simd_test = "sse4.1"]
@ -86,7 +96,7 @@ mod tests {
let a = f64x2::new(2.0, 3.0);
let b = f64x2::new(1.0, 4.0);
let e = f64x2::new(14.0, 0.0);
assert_eq!(sse41::_mm_dp_pd(a, b, 0b00110001), e);
assert_eq!(unsafe { sse41::_mm_dp_pd(a, b, 0b00110001) }, e);
}
#[simd_test = "sse4.1"]
@ -94,6 +104,6 @@ mod tests {
let a = f32x4::new(2.0, 3.0, 1.0, 10.0);
let b = f32x4::new(1.0, 4.0, 0.5, 10.0);
let e = f32x4::new(14.5, 0.0, 14.5, 0.0);
assert_eq!(sse41::_mm_dp_ps(a, b, 0b01110101), e);
assert_eq!(unsafe { sse41::_mm_dp_ps(a, b, 0b01110101) }, e);
}
}

View file

@ -1,3 +1,6 @@
#[cfg(test)]
use stdsimd_test::assert_instr;
use x86::__m128i;
pub const _SIDD_UBYTE_OPS: i8 = 0b00000000;
@ -19,7 +22,7 @@ pub const _SIDD_MOST_SIGNIFICANT: i8 = 0b01000000;
#[inline(always)]
#[target_feature = "+sse4.2"]
pub fn _mm_cmpestri(
pub unsafe fn _mm_cmpestri(
a: __m128i,
la: i32,
b: __m128i,
@ -27,13 +30,18 @@ pub fn _mm_cmpestri(
imm8: i8,
) -> i32 {
macro_rules! call {
($imm8:expr) => {
unsafe { pcmpestri128(a, la, b, lb, $imm8) }
}
($imm8:expr) => { pcmpestri128(a, la, b, lb, $imm8) }
}
constify_imm8!(imm8, call)
}
#[cfg(test)]
#[target_feature = "+sse4.2"]
#[cfg_attr(test, assert_instr(pcmpestri))]
fn _test_mm_cmpestri(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 {
unsafe { _mm_cmpestri(a, la, b, lb, 0) }
}
#[allow(improper_ctypes)]
extern {
#[link_name = "llvm.x86.sse42.pcmpestri128"]
@ -53,8 +61,10 @@ mod tests {
let b = &b"foobar "[..];
let va = __m128i::from(u8x16::load(a, 0));
let vb = __m128i::from(u8x16::load(b, 0));
let i = sse42::_mm_cmpestri(
va, 3, vb, 6, sse42::_SIDD_CMP_EQUAL_ORDERED);
let i = unsafe {
sse42::_mm_cmpestri(
va, 3, vb, 6, sse42::_SIDD_CMP_EQUAL_ORDERED)
};
assert_eq!(3, i);
}
}

View file

@ -1,15 +1,15 @@
use v128::*;
#[cfg(test)]
use stdsimd_test::assert_instr;
use v128::*;
/// Compute the absolute value of packed 8-bit signed integers in `a` and
/// return the unsigned results.
#[inline(always)]
#[target_feature = "+ssse3"]
#[cfg_attr(test, assert_instr(pabsb))]
pub fn _mm_abs_epi8(a: i8x16) -> u8x16 {
unsafe { pabsb128(a) }
pub unsafe fn _mm_abs_epi8(a: i8x16) -> u8x16 {
pabsb128(a)
}
/// Shuffle bytes from `a` according to the content of `b`.
@ -39,8 +39,8 @@ pub fn _mm_abs_epi8(a: i8x16) -> u8x16 {
#[inline(always)]
#[target_feature = "+ssse3"]
#[cfg_attr(test, assert_instr(pshufb))]
pub fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
unsafe { pshufb128(a, b) }
pub unsafe fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
pshufb128(a, b)
}
@ -48,7 +48,6 @@ pub fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
extern {
#[link_name = "llvm.x86.ssse3.pabs.b.128"]
fn pabsb128(a: i8x16) -> u8x16;
#[link_name = "llvm.x86.ssse3.pshuf.b.128"]
fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
}
@ -62,16 +61,31 @@ mod tests {
#[simd_test = "ssse3"]
fn _mm_abs_epi8() {
let r = ssse3::_mm_abs_epi8(i8x16::splat(-5));
let r = unsafe { ssse3::_mm_abs_epi8(i8x16::splat(-5)) };
assert_eq!(r, u8x16::splat(5));
}
#[simd_test = "ssse3"]
fn _mm_shuffle_epi8() {
let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let b = u8x16::new(4, 128, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
let expected = u8x16::new(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
let r = ssse3::_mm_shuffle_epi8(a, b);
let a = u8x16::new(
1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 14, 15, 16,
);
let b = u8x16::new(
4, 128, 4, 3,
24, 12, 6, 19,
12, 5, 5, 10,
4, 1, 8, 0,
);
let expected = u8x16::new(
5, 0, 5, 4,
9, 13, 7, 4,
13, 6, 6, 11,
5, 2, 9, 1,
);
let r = unsafe { ssse3::_mm_shuffle_epi8(a, b) };
assert_eq!(r, expected);
}
}

View file

@ -65,7 +65,7 @@ pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcfill))]
pub fn _blcfill_u32(x: u32) -> u32 {
pub unsafe fn _blcfill_u32(x: u32) -> u32 {
x & (x.wrapping_add(1))
}
@ -76,7 +76,7 @@ pub fn _blcfill_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcfill))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _blcfill_u64(x: u64) -> u64 {
pub unsafe fn _blcfill_u64(x: u64) -> u64 {
x & (x.wrapping_add(1))
}
@ -86,7 +86,7 @@ pub fn _blcfill_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blci))]
pub fn _blci_u32(x: u32) -> u32 {
pub unsafe fn _blci_u32(x: u32) -> u32 {
x | !(x.wrapping_add(1))
}
@ -97,7 +97,7 @@ pub fn _blci_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blci))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _blci_u64(x: u64) -> u64 {
pub unsafe fn _blci_u64(x: u64) -> u64 {
x | !(x.wrapping_add(1))
}
@ -107,7 +107,7 @@ pub fn _blci_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcic))]
pub fn _blcic_u32(x: u32) -> u32 {
pub unsafe fn _blcic_u32(x: u32) -> u32 {
!x & (x.wrapping_add(1))
}
@ -118,7 +118,7 @@ pub fn _blcic_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcic))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _blcic_u64(x: u64) -> u64 {
pub unsafe fn _blcic_u64(x: u64) -> u64 {
!x & (x.wrapping_add(1))
}
@ -128,7 +128,7 @@ pub fn _blcic_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcmsk))]
pub fn _blcmsk_u32(x: u32) -> u32 {
pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
x ^ (x.wrapping_add(1))
}
@ -139,7 +139,7 @@ pub fn _blcmsk_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcmsk))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _blcmsk_u64(x: u64) -> u64 {
pub unsafe fn _blcmsk_u64(x: u64) -> u64 {
x ^ (x.wrapping_add(1))
}
@ -149,7 +149,7 @@ pub fn _blcmsk_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcs))]
pub fn _blcs_u32(x: u32) -> u32 {
pub unsafe fn _blcs_u32(x: u32) -> u32 {
x | (x.wrapping_add(1))
}
@ -160,7 +160,7 @@ pub fn _blcs_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blcs))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _blcs_u64(x: u64) -> u64 {
pub unsafe fn _blcs_u64(x: u64) -> u64 {
x | x.wrapping_add(1)
}
@ -170,7 +170,7 @@ pub fn _blcs_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blsfill))]
pub fn _blsfill_u32(x: u32) -> u32 {
pub unsafe fn _blsfill_u32(x: u32) -> u32 {
x | (x.wrapping_sub(1))
}
@ -181,7 +181,7 @@ pub fn _blsfill_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blsfill))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _blsfill_u64(x: u64) -> u64 {
pub unsafe fn _blsfill_u64(x: u64) -> u64 {
x | (x.wrapping_sub(1))
}
@ -191,7 +191,7 @@ pub fn _blsfill_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blsic))]
pub fn _blsic_u32(x: u32) -> u32 {
pub unsafe fn _blsic_u32(x: u32) -> u32 {
!x | (x.wrapping_sub(1))
}
@ -202,7 +202,7 @@ pub fn _blsic_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(blsic))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _blsic_u64(x: u64) -> u64 {
pub unsafe fn _blsic_u64(x: u64) -> u64 {
!x | (x.wrapping_sub(1))
}
@ -213,7 +213,7 @@ pub fn _blsic_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(t1mskc))]
pub fn _t1mskc_u32(x: u32) -> u32 {
pub unsafe fn _t1mskc_u32(x: u32) -> u32 {
!x | (x.wrapping_add(1))
}
@ -225,7 +225,7 @@ pub fn _t1mskc_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(t1mskc))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _t1mskc_u64(x: u64) -> u64 {
pub unsafe fn _t1mskc_u64(x: u64) -> u64 {
!x | (x.wrapping_add(1))
}
@ -236,7 +236,7 @@ pub fn _t1mskc_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(tzmsk))]
pub fn _tzmsk_u32(x: u32) -> u32 {
pub unsafe fn _tzmsk_u32(x: u32) -> u32 {
!x & (x.wrapping_sub(1))
}
@ -248,7 +248,7 @@ pub fn _tzmsk_u32(x: u32) -> u32 {
#[target_feature = "+tbm"]
#[cfg_attr(test, assert_instr(tzmsk))]
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
pub fn _tzmsk_u64(x: u64) -> u64 {
pub unsafe fn _tzmsk_u64(x: u64) -> u64 {
!x & (x.wrapping_sub(1))
}
@ -272,122 +272,174 @@ mod tests {
#[simd_test = "tbm"]
fn _blcfill_u32() {
assert_eq!(tbm::_blcfill_u32(0b0101_0111u32), 0b0101_0000u32);
assert_eq!(tbm::_blcfill_u32(0b1111_1111u32), 0u32);
assert_eq!(
unsafe { tbm::_blcfill_u32(0b0101_0111u32) },
0b0101_0000u32);
assert_eq!(
unsafe { tbm::_blcfill_u32(0b1111_1111u32) },
0u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _blcfill_u64() {
assert_eq!(tbm::_blcfill_u64(0b0101_0111u64), 0b0101_0000u64);
assert_eq!(tbm::_blcfill_u64(0b1111_1111u64), 0u64);
assert_eq!(
unsafe { tbm::_blcfill_u64(0b0101_0111u64) },
0b0101_0000u64);
assert_eq!(
unsafe { tbm::_blcfill_u64(0b1111_1111u64) },
0u64);
}
#[simd_test = "tbm"]
fn _blci_u32() {
assert_eq!(tbm::_blci_u32(0b0101_0000u32),
0b1111_1111_1111_1111_1111_1111_1111_1110u32);
assert_eq!(tbm::_blci_u32(0b1111_1111u32),
0b1111_1111_1111_1111_1111_1110_1111_1111u32);
assert_eq!(
unsafe { tbm::_blci_u32(0b0101_0000u32) },
0b1111_1111_1111_1111_1111_1111_1111_1110u32);
assert_eq!(
unsafe { tbm::_blci_u32(0b1111_1111u32) },
0b1111_1111_1111_1111_1111_1110_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _blci_u64() {
assert_eq!(tbm::_blci_u64(0b0101_0000u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64);
assert_eq!(tbm::_blci_u64(0b1111_1111u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64);
assert_eq!(
unsafe { tbm::_blci_u64(0b0101_0000u64) },
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64);
assert_eq!(
unsafe { tbm::_blci_u64(0b1111_1111u64) },
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64);
}
#[simd_test = "tbm"]
fn _blcic_u32() {
assert_eq!(tbm::_blcic_u32(0b0101_0001u32), 0b0000_0010u32);
assert_eq!(tbm::_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32);
assert_eq!(
unsafe { tbm::_blcic_u32(0b0101_0001u32) },
0b0000_0010u32);
assert_eq!(
unsafe { tbm::_blcic_u32(0b1111_1111u32) },
0b1_0000_0000u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _blcic_u64() {
assert_eq!(tbm::_blcic_u64(0b0101_0001u64), 0b0000_0010u64);
assert_eq!(tbm::_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64);
assert_eq!(
unsafe { tbm::_blcic_u64(0b0101_0001u64) },
0b0000_0010u64);
assert_eq!(
unsafe { tbm::_blcic_u64(0b1111_1111u64) },
0b1_0000_0000u64);
}
#[simd_test = "tbm"]
fn _blcmsk_u32() {
assert_eq!(tbm::_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32);
assert_eq!(tbm::_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32);
assert_eq!(
unsafe { tbm::_blcmsk_u32(0b0101_0001u32) },
0b0000_0011u32);
assert_eq!(
unsafe { tbm::_blcmsk_u32(0b1111_1111u32) },
0b1_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _blcmsk_u64() {
assert_eq!(tbm::_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64);
assert_eq!(tbm::_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64);
assert_eq!(
unsafe { tbm::_blcmsk_u64(0b0101_0001u64) },
0b0000_0011u64);
assert_eq!(
unsafe { tbm::_blcmsk_u64(0b1111_1111u64) },
0b1_1111_1111u64);
}
#[simd_test = "tbm"]
fn _blcs_u32() {
assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32);
assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32);
assert_eq!(unsafe { tbm::_blcs_u32(0b0101_0001u32) }, 0b0101_0011u32);
assert_eq!(unsafe { tbm::_blcs_u32(0b1111_1111u32) }, 0b1_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _blcs_u64() {
assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64);
assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64);
assert_eq!(unsafe { tbm::_blcs_u64(0b0101_0001u64) }, 0b0101_0011u64);
assert_eq!(unsafe { tbm::_blcs_u64(0b1111_1111u64) }, 0b1_1111_1111u64);
}
#[simd_test = "tbm"]
fn _blsfill_u32() {
assert_eq!(tbm::_blsfill_u32(0b0101_0100u32), 0b0101_0111u32);
assert_eq!(tbm::_blsfill_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
assert_eq!(
unsafe { tbm::_blsfill_u32(0b0101_0100u32) },
0b0101_0111u32);
assert_eq!(
unsafe { tbm::_blsfill_u32(0u32) },
0b1111_1111_1111_1111_1111_1111_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _blsfill_u64() {
assert_eq!(tbm::_blsfill_u64(0b0101_0100u64), 0b0101_0111u64);
assert_eq!(tbm::_blsfill_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
assert_eq!(
unsafe { tbm::_blsfill_u64(0b0101_0100u64) },
0b0101_0111u64);
assert_eq!(
unsafe { tbm::_blsfill_u64(0u64) },
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
}
#[simd_test = "tbm"]
fn _blsic_u32() {
assert_eq!(tbm::_blsic_u32(0b0101_0100u32), 0b1111_1111_1111_1111_1111_1111_1111_1011u32);
assert_eq!(tbm::_blsic_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
assert_eq!(
unsafe { tbm::_blsic_u32(0b0101_0100u32) },
0b1111_1111_1111_1111_1111_1111_1111_1011u32);
assert_eq!(
unsafe { tbm::_blsic_u32(0u32) },
0b1111_1111_1111_1111_1111_1111_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _blsic_u64() {
assert_eq!(tbm::_blsic_u64(0b0101_0100u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64);
assert_eq!(tbm::_blsic_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
assert_eq!(
unsafe { tbm::_blsic_u64(0b0101_0100u64) },
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64);
assert_eq!(
unsafe { tbm::_blsic_u64(0u64) },
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
}
#[simd_test = "tbm"]
fn _t1mskc_u32() {
assert_eq!(tbm::_t1mskc_u32(0b0101_0111u32), 0b1111_1111_1111_1111_1111_1111_1111_1000u32);
assert_eq!(tbm::_t1mskc_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
assert_eq!(
unsafe { tbm::_t1mskc_u32(0b0101_0111u32) },
0b1111_1111_1111_1111_1111_1111_1111_1000u32);
assert_eq!(
unsafe { tbm::_t1mskc_u32(0u32) },
0b1111_1111_1111_1111_1111_1111_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _t1mksc_u64() {
assert_eq!(tbm::_t1mskc_u64(0b0101_0111u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64);
assert_eq!(tbm::_t1mskc_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
assert_eq!(
unsafe { tbm::_t1mskc_u64(0b0101_0111u64) },
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64);
assert_eq!(
unsafe { tbm::_t1mskc_u64(0u64) },
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
}
#[simd_test = "tbm"]
fn _tzmsk_u32() {
assert_eq!(tbm::_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32);
assert_eq!(tbm::_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32);
assert_eq!(unsafe { tbm::_tzmsk_u32(0b0101_1000u32) }, 0b0000_0111u32);
assert_eq!(unsafe { tbm::_tzmsk_u32(0b0101_1001u32) }, 0b0000_0000u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
fn _tzmsk_u64() {
assert_eq!(tbm::_tzmsk_u64(0b0101_1000u64), 0b0000_0111u64);
assert_eq!(tbm::_tzmsk_u64(0b0101_1001u64), 0b0000_0000u64);
assert_eq!(unsafe { tbm::_tzmsk_u64(0b0101_1000u64) }, 0b0000_0111u64);
assert_eq!(unsafe { tbm::_tzmsk_u64(0b0101_1001u64) }, 0b0000_0000u64);
}
}