x86: add unsafe to all x86 vendor intrinsics
Also, add missing assert_instr tests to each intrinsic, where possible.
This commit is contained in:
parent
ff9e960628
commit
6dfc65289c
12 changed files with 1611 additions and 1213 deletions
|
|
@ -24,9 +24,11 @@ mod example {
|
|||
haystack.resize(16, 0);
|
||||
let vhaystack = vendor::__m128i::from(s::u8x16::load(&haystack, 0));
|
||||
|
||||
vendor::_mm_cmpestri(
|
||||
vneedle, needle_len as i32, vhaystack, hay_len as i32,
|
||||
vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
|
||||
unsafe {
|
||||
vendor::_mm_cmpestri(
|
||||
vneedle, needle_len as i32, vhaystack, hay_len as i32,
|
||||
vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
|
||||
}
|
||||
}
|
||||
|
||||
pub fn main() {
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ use stdsimd_test::assert_instr;
|
|||
#[inline(always)]
|
||||
#[target_feature = "+lzcnt"]
|
||||
#[cfg_attr(test, assert_instr(lzcnt))]
|
||||
pub fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
|
||||
pub unsafe fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
|
||||
|
||||
/// Counts the leading most significant zero bits.
|
||||
///
|
||||
|
|
@ -27,19 +27,19 @@ pub fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
|
|||
#[inline(always)]
|
||||
#[target_feature = "+lzcnt"]
|
||||
#[cfg_attr(test, assert_instr(lzcnt))]
|
||||
pub fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 }
|
||||
pub unsafe fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 }
|
||||
|
||||
/// Counts the bits that are set.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+popcnt"]
|
||||
#[cfg_attr(test, assert_instr(popcnt))]
|
||||
pub fn _popcnt32(x: u32) -> u32 { x.count_ones() }
|
||||
pub unsafe fn _popcnt32(x: u32) -> u32 { x.count_ones() }
|
||||
|
||||
/// Counts the bits that are set.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+popcnt"]
|
||||
#[cfg_attr(test, assert_instr(popcnt))]
|
||||
pub fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 }
|
||||
pub unsafe fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 }
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
|
@ -49,21 +49,21 @@ mod tests {
|
|||
|
||||
#[simd_test = "lzcnt"]
|
||||
fn _lzcnt_u32() {
|
||||
assert_eq!(abm::_lzcnt_u32(0b0101_1010u32), 25u32);
|
||||
assert_eq!(unsafe { abm::_lzcnt_u32(0b0101_1010u32) }, 25u32);
|
||||
}
|
||||
|
||||
#[simd_test = "lzcnt"]
|
||||
fn _lzcnt_u64() {
|
||||
assert_eq!(abm::_lzcnt_u64(0b0101_1010u64), 57u64);
|
||||
assert_eq!(unsafe { abm::_lzcnt_u64(0b0101_1010u64) }, 57u64);
|
||||
}
|
||||
|
||||
#[simd_test = "popcnt"]
|
||||
fn _popcnt32() {
|
||||
assert_eq!(abm::_popcnt32(0b0101_1010u32), 4);
|
||||
assert_eq!(unsafe { abm::_popcnt32(0b0101_1010u32) }, 4);
|
||||
}
|
||||
|
||||
#[simd_test = "popcnt"]
|
||||
fn _popcnt64() {
|
||||
assert_eq!(abm::_popcnt64(0b0101_1010u64), 4);
|
||||
assert_eq!(unsafe { abm::_popcnt64(0b0101_1010u64) }, 4);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,14 +1,14 @@
|
|||
use v256::*;
|
||||
|
||||
#[cfg(test)]
|
||||
use stdsimd_test::assert_instr;
|
||||
|
||||
use v256::*;
|
||||
|
||||
/// Add packed double-precision (64-bit) floating-point elements
|
||||
/// in `a` and `b`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vaddpd))]
|
||||
pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
|
||||
pub unsafe fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
|
||||
a + b
|
||||
}
|
||||
|
||||
|
|
@ -16,7 +16,7 @@ pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vaddps))]
|
||||
pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
|
||||
pub unsafe fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
|
||||
a + b
|
||||
}
|
||||
|
||||
|
|
@ -25,7 +25,7 @@ pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vmulpd))]
|
||||
pub fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
|
||||
pub unsafe fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
|
||||
a * b
|
||||
}
|
||||
|
||||
|
|
@ -33,7 +33,7 @@ pub fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vmulps))]
|
||||
pub fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
|
||||
pub unsafe fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
|
||||
a * b
|
||||
}
|
||||
|
||||
|
|
@ -42,8 +42,8 @@ pub fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vaddsubpd))]
|
||||
pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
|
||||
unsafe { addsubpd256(a, b) }
|
||||
pub unsafe fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
|
||||
addsubpd256(a, b)
|
||||
}
|
||||
|
||||
/// Alternatively add and subtract packed single-precision (32-bit)
|
||||
|
|
@ -51,8 +51,8 @@ pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vaddsubps))]
|
||||
pub fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
|
||||
unsafe { addsubps256(a, b) }
|
||||
pub unsafe fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
|
||||
addsubps256(a, b)
|
||||
}
|
||||
|
||||
/// Subtract packed double-precision (64-bit) floating-point elements in `b`
|
||||
|
|
@ -60,7 +60,7 @@ pub fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vsubpd))]
|
||||
pub fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
|
||||
pub unsafe fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
|
||||
a - b
|
||||
}
|
||||
|
||||
|
|
@ -69,25 +69,24 @@ pub fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vsubps))]
|
||||
pub fn _mm256_sub_ps(a: f32x8, b: f32x8) -> f32x8 {
|
||||
pub unsafe fn _mm256_sub_ps(a: f32x8, b: f32x8) -> f32x8 {
|
||||
a - b
|
||||
}
|
||||
|
||||
/// Round packed double-precision (64-bit) floating point elements in `a`
|
||||
/// according to the flag `b`. The value of `b` may be as follows:
|
||||
///
|
||||
/// ```ignore
|
||||
/// 0x00: Round to the nearest whole number.
|
||||
/// 0x01: Round down, toward negative infinity.
|
||||
/// 0x02: Round up, toward positive infinity.
|
||||
/// 0x03: Truncate the values.
|
||||
/// For a few additional values options, check the LLVM docs:
|
||||
/// https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
|
||||
/// ```
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
pub fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
|
||||
pub unsafe fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => {
|
||||
unsafe { roundpd256(a, $imm8) }
|
||||
}
|
||||
($imm8:expr) => { roundpd256(a, $imm8) }
|
||||
}
|
||||
constify_imm8!(b, call)
|
||||
}
|
||||
|
|
@ -96,7 +95,7 @@ pub fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
|
|||
#[cfg_attr(test, assert_instr(vroundpd))]
|
||||
#[target_feature = "+avx"]
|
||||
fn test_mm256_round_pd(a: f64x4) -> f64x4 {
|
||||
_mm256_round_pd(a, 0x3)
|
||||
unsafe { _mm256_round_pd(a, 0x3) }
|
||||
}
|
||||
|
||||
/// Round packed double-precision (64-bit) floating point elements in `a` toward
|
||||
|
|
@ -104,8 +103,8 @@ fn test_mm256_round_pd(a: f64x4) -> f64x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vroundpd))]
|
||||
pub fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
|
||||
unsafe { roundpd256(a, 0x02) }
|
||||
pub unsafe fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
|
||||
roundpd256(a, 0x02)
|
||||
}
|
||||
|
||||
/// Round packed double-precision (64-bit) floating point elements in `a` toward
|
||||
|
|
@ -113,8 +112,8 @@ pub fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vroundpd))]
|
||||
pub fn _mm256_floor_pd(a: f64x4) -> f64x4 {
|
||||
unsafe { roundpd256(a, 0x01) }
|
||||
pub unsafe fn _mm256_floor_pd(a: f64x4) -> f64x4 {
|
||||
roundpd256(a, 0x01)
|
||||
}
|
||||
|
||||
/// LLVM intrinsics used in the above functions
|
||||
|
|
@ -139,7 +138,7 @@ mod tests {
|
|||
fn _mm256_add_pd() {
|
||||
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
|
||||
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
|
||||
let r = avx::_mm256_add_pd(a, b);
|
||||
let r = unsafe { avx::_mm256_add_pd(a, b) };
|
||||
let e = f64x4::new(6.0, 8.0, 10.0, 12.0);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -148,7 +147,7 @@ mod tests {
|
|||
fn _mm256_add_ps() {
|
||||
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
|
||||
let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
|
||||
let r = avx::_mm256_add_ps(a, b);
|
||||
let r = unsafe { avx::_mm256_add_ps(a, b) };
|
||||
let e = f32x8::new(10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -157,7 +156,7 @@ mod tests {
|
|||
fn _mm256_mul_pd() {
|
||||
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
|
||||
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
|
||||
let r = avx::_mm256_mul_pd(a, b);
|
||||
let r = unsafe { avx::_mm256_mul_pd(a, b) };
|
||||
let e = f64x4::new(5.0, 12.0, 21.0, 32.0);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -166,7 +165,7 @@ mod tests {
|
|||
fn _mm256_mul_ps() {
|
||||
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
|
||||
let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
|
||||
let r = avx::_mm256_mul_ps(a, b);
|
||||
let r = unsafe { avx::_mm256_mul_ps(a, b) };
|
||||
let e = f32x8::new(9.0, 20.0, 33.0, 48.0, 65.0, 84.0, 105.0, 128.0);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -175,7 +174,7 @@ mod tests {
|
|||
fn _mm256_addsub_pd() {
|
||||
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
|
||||
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
|
||||
let r = avx::_mm256_addsub_pd(a, b);
|
||||
let r = unsafe { avx::_mm256_addsub_pd(a, b) };
|
||||
let e = f64x4::new(-4.0, 8.0, -4.0, 12.0);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -184,7 +183,7 @@ mod tests {
|
|||
fn _mm256_addsub_ps() {
|
||||
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0);
|
||||
let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0);
|
||||
let r = avx::_mm256_addsub_ps(a, b);
|
||||
let r = unsafe { avx::_mm256_addsub_ps(a, b) };
|
||||
let e = f32x8::new(-4.0, 8.0, -4.0, 12.0, -4.0, 8.0, -4.0, 12.0);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -193,7 +192,7 @@ mod tests {
|
|||
fn _mm256_sub_pd() {
|
||||
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
|
||||
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
|
||||
let r = avx::_mm256_sub_pd(a, b);
|
||||
let r = unsafe { avx::_mm256_sub_pd(a, b) };
|
||||
let e = f64x4::new(-4.0,-4.0,-4.0,-4.0);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -202,7 +201,7 @@ mod tests {
|
|||
fn _mm256_sub_ps() {
|
||||
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, -1.0, -2.0, -3.0, -4.0);
|
||||
let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 3.0, 2.0, 1.0, 0.0);
|
||||
let r = avx::_mm256_sub_ps(a, b);
|
||||
let r = unsafe { avx::_mm256_sub_ps(a, b) };
|
||||
let e = f32x8::new(-4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -4.0);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -210,9 +209,9 @@ mod tests {
|
|||
#[simd_test = "avx"]
|
||||
fn _mm256_round_pd() {
|
||||
let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
|
||||
let result_closest = avx::_mm256_round_pd(a, 0b00000000);
|
||||
let result_down = avx::_mm256_round_pd(a, 0b00000001);
|
||||
let result_up = avx::_mm256_round_pd(a, 0b00000010);
|
||||
let result_closest = unsafe { avx::_mm256_round_pd(a, 0b00000000) };
|
||||
let result_down = unsafe { avx::_mm256_round_pd(a, 0b00000001) };
|
||||
let result_up = unsafe { avx::_mm256_round_pd(a, 0b00000010) };
|
||||
let expected_closest = f64x4::new(2.0, 2.0, 4.0, -1.0);
|
||||
let expected_down = f64x4::new(1.0, 2.0, 3.0, -2.0);
|
||||
let expected_up = f64x4::new(2.0, 3.0, 4.0, -1.0);
|
||||
|
|
@ -224,7 +223,7 @@ mod tests {
|
|||
#[simd_test = "avx"]
|
||||
fn _mm256_floor_pd() {
|
||||
let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
|
||||
let result_down = avx::_mm256_floor_pd(a);
|
||||
let result_down = unsafe { avx::_mm256_floor_pd(a) };
|
||||
let expected_down = f64x4::new(1.0, 2.0, 3.0, -2.0);
|
||||
assert_eq!(result_down, expected_down);
|
||||
}
|
||||
|
|
@ -232,7 +231,7 @@ mod tests {
|
|||
#[simd_test = "avx"]
|
||||
fn _mm256_ceil_pd() {
|
||||
let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
|
||||
let result_up = avx::_mm256_ceil_pd(a, );
|
||||
let result_up = unsafe { avx::_mm256_ceil_pd(a) };
|
||||
let expected_up = f64x4::new(2.0, 3.0, 4.0, -1.0);
|
||||
assert_eq!(result_up, expected_up);
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -10,20 +10,12 @@
|
|||
#[cfg(test)]
|
||||
use stdsimd_test::assert_instr;
|
||||
|
||||
#[allow(dead_code)]
|
||||
extern "C" {
|
||||
#[link_name="llvm.x86.bmi.bextr.32"]
|
||||
fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
|
||||
#[link_name="llvm.x86.bmi.bextr.64"]
|
||||
fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
|
||||
}
|
||||
|
||||
/// Extracts bits in range [`start`, `start` + `length`) from `a` into
|
||||
/// the least significant bits of the result.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(bextr))]
|
||||
pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
|
||||
pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
|
||||
_bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32))
|
||||
}
|
||||
|
||||
|
|
@ -33,7 +25,7 @@ pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
|
|||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(bextr))]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
|
||||
pub unsafe fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
|
||||
_bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64))
|
||||
}
|
||||
|
||||
|
|
@ -45,8 +37,8 @@ pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(bextr))]
|
||||
pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
|
||||
unsafe { x86_bmi_bextr_32(a, control) }
|
||||
pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 {
|
||||
x86_bmi_bextr_32(a, control)
|
||||
}
|
||||
|
||||
/// Extracts bits of `a` specified by `control` into
|
||||
|
|
@ -58,15 +50,15 @@ pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
|
|||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(bextr))]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
|
||||
unsafe { x86_bmi_bextr_64(a, control) }
|
||||
pub unsafe fn _bextr2_u64(a: u64, control: u64) -> u64 {
|
||||
x86_bmi_bextr_64(a, control)
|
||||
}
|
||||
|
||||
/// Bitwise logical `AND` of inverted `a` with `b`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(andn))]
|
||||
pub fn _andn_u32(a: u32, b: u32) -> u32 {
|
||||
pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 {
|
||||
!a & b
|
||||
}
|
||||
|
||||
|
|
@ -74,7 +66,7 @@ pub fn _andn_u32(a: u32, b: u32) -> u32 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(andn))]
|
||||
pub fn _andn_u64(a: u64, b: u64) -> u64 {
|
||||
pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 {
|
||||
!a & b
|
||||
}
|
||||
|
||||
|
|
@ -82,7 +74,7 @@ pub fn _andn_u64(a: u64, b: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(blsi))]
|
||||
pub fn _blsi_u32(x: u32) -> u32 {
|
||||
pub unsafe fn _blsi_u32(x: u32) -> u32 {
|
||||
x & x.wrapping_neg()
|
||||
}
|
||||
|
||||
|
|
@ -91,7 +83,7 @@ pub fn _blsi_u32(x: u32) -> u32 {
|
|||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(blsi))]
|
||||
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
|
||||
pub fn _blsi_u64(x: u64) -> u64 {
|
||||
pub unsafe fn _blsi_u64(x: u64) -> u64 {
|
||||
x & x.wrapping_neg()
|
||||
}
|
||||
|
||||
|
|
@ -99,7 +91,7 @@ pub fn _blsi_u64(x: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(blsmsk))]
|
||||
pub fn _blsmsk_u32(x: u32) -> u32 {
|
||||
pub unsafe fn _blsmsk_u32(x: u32) -> u32 {
|
||||
x ^ (x.wrapping_sub(1u32))
|
||||
}
|
||||
|
||||
|
|
@ -108,7 +100,7 @@ pub fn _blsmsk_u32(x: u32) -> u32 {
|
|||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(blsmsk))]
|
||||
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
|
||||
pub fn _blsmsk_u64(x: u64) -> u64 {
|
||||
pub unsafe fn _blsmsk_u64(x: u64) -> u64 {
|
||||
x ^ (x.wrapping_sub(1u64))
|
||||
}
|
||||
|
||||
|
|
@ -118,7 +110,7 @@ pub fn _blsmsk_u64(x: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(blsr))]
|
||||
pub fn _blsr_u32(x: u32) -> u32 {
|
||||
pub unsafe fn _blsr_u32(x: u32) -> u32 {
|
||||
x & (x.wrapping_sub(1))
|
||||
}
|
||||
|
||||
|
|
@ -129,7 +121,7 @@ pub fn _blsr_u32(x: u32) -> u32 {
|
|||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(blsr))]
|
||||
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
|
||||
pub fn _blsr_u64(x: u64) -> u64 {
|
||||
pub unsafe fn _blsr_u64(x: u64) -> u64 {
|
||||
x & (x.wrapping_sub(1))
|
||||
}
|
||||
|
||||
|
|
@ -139,7 +131,7 @@ pub fn _blsr_u64(x: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(tzcnt))]
|
||||
pub fn _tzcnt_u16(x: u16) -> u16 {
|
||||
pub unsafe fn _tzcnt_u16(x: u16) -> u16 {
|
||||
x.trailing_zeros() as u16
|
||||
}
|
||||
|
||||
|
|
@ -149,7 +141,7 @@ pub fn _tzcnt_u16(x: u16) -> u16 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(tzcnt))]
|
||||
pub fn _tzcnt_u32(x: u32) -> u32 {
|
||||
pub unsafe fn _tzcnt_u32(x: u32) -> u32 {
|
||||
x.trailing_zeros()
|
||||
}
|
||||
|
||||
|
|
@ -159,7 +151,7 @@ pub fn _tzcnt_u32(x: u32) -> u32 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(tzcnt))]
|
||||
pub fn _tzcnt_u64(x: u64) -> u64 {
|
||||
pub unsafe fn _tzcnt_u64(x: u64) -> u64 {
|
||||
x.trailing_zeros() as u64
|
||||
}
|
||||
|
||||
|
|
@ -169,7 +161,7 @@ pub fn _tzcnt_u64(x: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(tzcnt))]
|
||||
pub fn _mm_tzcnt_u32(x: u32) -> u32 {
|
||||
pub unsafe fn _mm_tzcnt_u32(x: u32) -> u32 {
|
||||
x.trailing_zeros()
|
||||
}
|
||||
|
||||
|
|
@ -179,10 +171,18 @@ pub fn _mm_tzcnt_u32(x: u32) -> u32 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(tzcnt))]
|
||||
pub fn _mm_tzcnt_u64(x: u64) -> u64 {
|
||||
pub unsafe fn _mm_tzcnt_u64(x: u64) -> u64 {
|
||||
x.trailing_zeros() as u64
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
extern "C" {
|
||||
#[link_name="llvm.x86.bmi.bextr.32"]
|
||||
fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
|
||||
#[link_name="llvm.x86.bmi.bextr.64"]
|
||||
fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use stdsimd_test::simd_test;
|
||||
|
|
@ -191,98 +191,122 @@ mod tests {
|
|||
|
||||
#[simd_test = "bmi"]
|
||||
fn _bextr_u32() {
|
||||
assert_eq!(bmi::_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32);
|
||||
let r = unsafe { bmi::_bextr_u32(0b0101_0000u32, 4, 4) };
|
||||
assert_eq!(r, 0b0000_0101u32);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _bextr_u64() {
|
||||
assert_eq!(bmi::_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64);
|
||||
let r = unsafe { bmi::_bextr_u64(0b0101_0000u64, 4, 4) };
|
||||
assert_eq!(r, 0b0000_0101u64);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi"]
|
||||
fn _andn_u32() {
|
||||
assert_eq!(bmi::_andn_u32(0, 0), 0);
|
||||
assert_eq!(bmi::_andn_u32(0, 1), 1);
|
||||
assert_eq!(bmi::_andn_u32(1, 0), 0);
|
||||
assert_eq!(bmi::_andn_u32(1, 1), 0);
|
||||
assert_eq!(unsafe { bmi::_andn_u32(0, 0) }, 0);
|
||||
assert_eq!(unsafe { bmi::_andn_u32(0, 1) }, 1);
|
||||
assert_eq!(unsafe { bmi::_andn_u32(1, 0) }, 0);
|
||||
assert_eq!(unsafe { bmi::_andn_u32(1, 1) }, 0);
|
||||
|
||||
assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32), 0b0000_0000u32);
|
||||
assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32), 0b1111_1111u32);
|
||||
assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32), 0b0000_0000u32);
|
||||
assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32), 0b0000_0000u32);
|
||||
assert_eq!(bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32), 0b0001_1101u32);
|
||||
let r = unsafe { bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32) };
|
||||
assert_eq!(r, 0b0000_0000u32);
|
||||
|
||||
let r = unsafe { bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32) };
|
||||
assert_eq!(r, 0b1111_1111u32);
|
||||
|
||||
let r = unsafe { bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32) };
|
||||
assert_eq!(r, 0b0000_0000u32);
|
||||
|
||||
let r = unsafe { bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32) };
|
||||
assert_eq!(r, 0b0000_0000u32);
|
||||
|
||||
let r = unsafe { bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32) };
|
||||
assert_eq!(r, 0b0001_1101u32);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _andn_u64() {
|
||||
assert_eq!(bmi::_andn_u64(0, 0), 0);
|
||||
assert_eq!(bmi::_andn_u64(0, 1), 1);
|
||||
assert_eq!(bmi::_andn_u64(1, 0), 0);
|
||||
assert_eq!(bmi::_andn_u64(1, 1), 0);
|
||||
assert_eq!(unsafe { bmi::_andn_u64(0, 0) }, 0);
|
||||
assert_eq!(unsafe { bmi::_andn_u64(0, 1) }, 1);
|
||||
assert_eq!(unsafe { bmi::_andn_u64(1, 0) }, 0);
|
||||
assert_eq!(unsafe { bmi::_andn_u64(1, 1) }, 0);
|
||||
|
||||
assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b0000_0000u64), 0b0000_0000u64);
|
||||
assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b1111_1111u64), 0b1111_1111u64);
|
||||
assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b0000_0000u64), 0b0000_0000u64);
|
||||
assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b1111_1111u64), 0b0000_0000u64);
|
||||
assert_eq!(bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64), 0b0001_1101u64);
|
||||
let r = unsafe { bmi::_andn_u64(0b0000_0000u64, 0b0000_0000u64) };
|
||||
assert_eq!(r, 0b0000_0000u64);
|
||||
|
||||
let r = unsafe { bmi::_andn_u64(0b0000_0000u64, 0b1111_1111u64) };
|
||||
assert_eq!(r, 0b1111_1111u64);
|
||||
|
||||
let r = unsafe { bmi::_andn_u64(0b1111_1111u64, 0b0000_0000u64) };
|
||||
assert_eq!(r, 0b0000_0000u64);
|
||||
|
||||
let r = unsafe { bmi::_andn_u64(0b1111_1111u64, 0b1111_1111u64) };
|
||||
assert_eq!(r, 0b0000_0000u64);
|
||||
|
||||
let r = unsafe { bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64) };
|
||||
assert_eq!(r, 0b0001_1101u64);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi"]
|
||||
fn _blsi_u32() {
|
||||
assert_eq!(bmi::_blsi_u32(0b1101_0000u32), 0b0001_0000u32);
|
||||
assert_eq!(unsafe { bmi::_blsi_u32(0b1101_0000u32) }, 0b0001_0000u32);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _blsi_u64() {
|
||||
assert_eq!(bmi::_blsi_u64(0b1101_0000u64), 0b0001_0000u64);
|
||||
assert_eq!(unsafe { bmi::_blsi_u64(0b1101_0000u64) }, 0b0001_0000u64);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi"]
|
||||
fn _blsmsk_u32() {
|
||||
assert_eq!(bmi::_blsmsk_u32(0b0011_0000u32), 0b0001_1111u32);
|
||||
let r = unsafe { bmi::_blsmsk_u32(0b0011_0000u32) };
|
||||
assert_eq!(r, 0b0001_1111u32);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _blsmsk_u64() {
|
||||
assert_eq!(bmi::_blsmsk_u64(0b0011_0000u64), 0b0001_1111u64);
|
||||
let r = unsafe { bmi::_blsmsk_u64(0b0011_0000u64) };
|
||||
assert_eq!(r, 0b0001_1111u64);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi"]
|
||||
fn _blsr_u32() {
|
||||
/// TODO: test the behavior when the input is 0
|
||||
assert_eq!(bmi::_blsr_u32(0b0011_0000u32), 0b0010_0000u32);
|
||||
// TODO: test the behavior when the input is 0
|
||||
let r = unsafe { bmi::_blsr_u32(0b0011_0000u32) };
|
||||
assert_eq!(r, 0b0010_0000u32);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _blsr_u64() {
|
||||
/// TODO: test the behavior when the input is 0
|
||||
assert_eq!(bmi::_blsr_u64(0b0011_0000u64), 0b0010_0000u64);
|
||||
// TODO: test the behavior when the input is 0
|
||||
let r = unsafe { bmi::_blsr_u64(0b0011_0000u64) };
|
||||
assert_eq!(r, 0b0010_0000u64);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi"]
|
||||
fn _tzcnt_u16() {
|
||||
assert_eq!(bmi::_tzcnt_u16(0b0000_0001u16), 0u16);
|
||||
assert_eq!(bmi::_tzcnt_u16(0b0000_0000u16), 16u16);
|
||||
assert_eq!(bmi::_tzcnt_u16(0b1001_0000u16), 4u16);
|
||||
assert_eq!(unsafe { bmi::_tzcnt_u16(0b0000_0001u16) }, 0u16);
|
||||
assert_eq!(unsafe { bmi::_tzcnt_u16(0b0000_0000u16) }, 16u16);
|
||||
assert_eq!(unsafe { bmi::_tzcnt_u16(0b1001_0000u16) }, 4u16);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi"]
|
||||
fn _tzcnt_u32() {
|
||||
assert_eq!(bmi::_tzcnt_u32(0b0000_0001u32), 0u32);
|
||||
assert_eq!(bmi::_tzcnt_u32(0b0000_0000u32), 32u32);
|
||||
assert_eq!(bmi::_tzcnt_u32(0b1001_0000u32), 4u32);
|
||||
assert_eq!(unsafe { bmi::_tzcnt_u32(0b0000_0001u32) }, 0u32);
|
||||
assert_eq!(unsafe { bmi::_tzcnt_u32(0b0000_0000u32) }, 32u32);
|
||||
assert_eq!(unsafe { bmi::_tzcnt_u32(0b1001_0000u32) }, 4u32);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _tzcnt_u64() {
|
||||
assert_eq!(bmi::_tzcnt_u64(0b0000_0001u64), 0u64);
|
||||
assert_eq!(bmi::_tzcnt_u64(0b0000_0000u64), 64u64);
|
||||
assert_eq!(bmi::_tzcnt_u64(0b1001_0000u64), 4u64);
|
||||
assert_eq!(unsafe { bmi::_tzcnt_u64(0b0000_0001u64) }, 0u64);
|
||||
assert_eq!(unsafe { bmi::_tzcnt_u64(0b0000_0000u64) }, 64u64);
|
||||
assert_eq!(unsafe { bmi::_tzcnt_u64(0b1001_0000u64) }, 4u64);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ use stdsimd_test::assert_instr;
|
|||
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))]
|
||||
#[cfg_attr(all(test, target_arch = "x86"), assert_instr(mulx))]
|
||||
#[target_feature = "+bmi2"]
|
||||
pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
|
||||
pub unsafe fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
|
||||
let result: u64 = (a as u64) * (b as u64);
|
||||
let hi = (result >> 32) as u32;
|
||||
(result as u32, hi)
|
||||
|
|
@ -33,12 +33,67 @@ pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
|
|||
#[cfg_attr(test, assert_instr(mulx))]
|
||||
#[target_feature = "+bmi2"]
|
||||
#[cfg(not(target_arch = "x86"))] // calls an intrinsic
|
||||
pub fn _mulx_u64(a: u64, b: u64) -> (u64, u64) {
|
||||
pub unsafe fn _mulx_u64(a: u64, b: u64) -> (u64, u64) {
|
||||
let result: u128 = (a as u128) * (b as u128);
|
||||
let hi = (result >> 64) as u64;
|
||||
(result as u64, hi)
|
||||
}
|
||||
|
||||
/// Zero higher bits of `a` >= `index`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi2"]
|
||||
#[cfg_attr(test, assert_instr(bzhi))]
|
||||
pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 {
|
||||
x86_bmi2_bzhi_32(a, index)
|
||||
}
|
||||
|
||||
/// Zero higher bits of `a` >= `index`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi2"]
|
||||
#[cfg_attr(test, assert_instr(bzhi))]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
pub unsafe fn _bzhi_u64(a: u64, index: u64) -> u64 {
|
||||
x86_bmi2_bzhi_64(a, index)
|
||||
}
|
||||
|
||||
/// Scatter contiguous low order bits of `a` to the result at the positions
|
||||
/// specified by the `mask`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi2"]
|
||||
#[cfg_attr(test, assert_instr(pdep))]
|
||||
pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 {
|
||||
x86_bmi2_pdep_32(a, mask)
|
||||
}
|
||||
|
||||
/// Scatter contiguous low order bits of `a` to the result at the positions
|
||||
/// specified by the `mask`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi2"]
|
||||
#[cfg_attr(test, assert_instr(pdep))]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
pub unsafe fn _pdep_u64(a: u64, mask: u64) -> u64 {
|
||||
x86_bmi2_pdep_64(a, mask)
|
||||
}
|
||||
|
||||
/// Gathers the bits of `x` specified by the `mask` into the contiguous low
|
||||
/// order bit positions of the result.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi2"]
|
||||
#[cfg_attr(test, assert_instr(pext))]
|
||||
pub unsafe fn _pext_u32(a: u32, mask: u32) -> u32 {
|
||||
x86_bmi2_pext_32(a, mask)
|
||||
}
|
||||
|
||||
/// Gathers the bits of `x` specified by the `mask` into the contiguous low
|
||||
/// order bit positions of the result.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi2"]
|
||||
#[cfg_attr(test, assert_instr(pext))]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
pub unsafe fn _pext_u64(a: u64, mask: u64) -> u64 {
|
||||
x86_bmi2_pext_64(a, mask)
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
extern "C" {
|
||||
#[link_name="llvm.x86.bmi.bzhi.32"]
|
||||
|
|
@ -55,63 +110,6 @@ extern "C" {
|
|||
fn x86_bmi2_pext_64(x: u64, y: u64) -> u64;
|
||||
}
|
||||
|
||||
|
||||
/// Zero higher bits of `a` >= `index`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi2"]
|
||||
#[cfg_attr(test, assert_instr(bzhi))]
|
||||
pub fn _bzhi_u32(a: u32, index: u32) -> u32 {
|
||||
unsafe { x86_bmi2_bzhi_32(a, index) }
|
||||
}
|
||||
|
||||
/// Zero higher bits of `a` >= `index`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi2"]
|
||||
#[cfg_attr(test, assert_instr(bzhi))]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
pub fn _bzhi_u64(a: u64, index: u64) -> u64 {
|
||||
unsafe { x86_bmi2_bzhi_64(a, index) }
|
||||
}
|
||||
|
||||
|
||||
/// Scatter contiguous low order bits of `a` to the result at the positions
|
||||
/// specified by the `mask`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi2"]
|
||||
#[cfg_attr(test, assert_instr(pdep))]
|
||||
pub fn _pdep_u32(a: u32, mask: u32) -> u32 {
|
||||
unsafe { x86_bmi2_pdep_32(a, mask) }
|
||||
}
|
||||
|
||||
/// Scatter contiguous low order bits of `a` to the result at the positions
|
||||
/// specified by the `mask`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi2"]
|
||||
#[cfg_attr(test, assert_instr(pdep))]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
pub fn _pdep_u64(a: u64, mask: u64) -> u64 {
|
||||
unsafe { x86_bmi2_pdep_64(a, mask) }
|
||||
}
|
||||
|
||||
/// Gathers the bits of `x` specified by the `mask` into the contiguous low
|
||||
/// order bit positions of the result.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi2"]
|
||||
#[cfg_attr(test, assert_instr(pext))]
|
||||
pub fn _pext_u32(a: u32, mask: u32) -> u32 {
|
||||
unsafe { x86_bmi2_pext_32(a, mask) }
|
||||
}
|
||||
|
||||
/// Gathers the bits of `x` specified by the `mask` into the contiguous low
|
||||
/// order bit positions of the result.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi2"]
|
||||
#[cfg_attr(test, assert_instr(pext))]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
pub fn _pext_u64(a: u64, mask: u64) -> u64 {
|
||||
unsafe { x86_bmi2_pext_64(a, mask) }
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use stdsimd_test::simd_test;
|
||||
|
|
@ -128,8 +126,8 @@ mod tests {
|
|||
let m1 = 0b1110_1011_1110_1111u32;
|
||||
let s1 = 0b0001_0111_0100_0011u32;
|
||||
|
||||
assert_eq!(bmi2::_pext_u32(n, m0), s0);
|
||||
assert_eq!(bmi2::_pext_u32(n, m1), s1);
|
||||
assert_eq!(unsafe { bmi2::_pext_u32(n, m0) }, s0);
|
||||
assert_eq!(unsafe { bmi2::_pext_u32(n, m1) }, s1);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi2"]
|
||||
|
|
@ -143,8 +141,8 @@ mod tests {
|
|||
let m1 = 0b1110_1011_1110_1111u64;
|
||||
let s1 = 0b0001_0111_0100_0011u64;
|
||||
|
||||
assert_eq!(bmi2::_pext_u64(n, m0), s0);
|
||||
assert_eq!(bmi2::_pext_u64(n, m1), s1);
|
||||
assert_eq!(unsafe { bmi2::_pext_u64(n, m0) }, s0);
|
||||
assert_eq!(unsafe { bmi2::_pext_u64(n, m1) }, s1);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi2"]
|
||||
|
|
@ -157,8 +155,8 @@ mod tests {
|
|||
let m1 = 0b1110_1011_1110_1111u32;
|
||||
let s1 = 0b1110_1001_0010_0011u32;
|
||||
|
||||
assert_eq!(bmi2::_pdep_u32(n, m0), s0);
|
||||
assert_eq!(bmi2::_pdep_u32(n, m1), s1);
|
||||
assert_eq!(unsafe { bmi2::_pdep_u32(n, m0) }, s0);
|
||||
assert_eq!(unsafe { bmi2::_pdep_u32(n, m1) }, s1);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi2"]
|
||||
|
|
@ -172,15 +170,15 @@ mod tests {
|
|||
let m1 = 0b1110_1011_1110_1111u64;
|
||||
let s1 = 0b1110_1001_0010_0011u64;
|
||||
|
||||
assert_eq!(bmi2::_pdep_u64(n, m0), s0);
|
||||
assert_eq!(bmi2::_pdep_u64(n, m1), s1);
|
||||
assert_eq!(unsafe { bmi2::_pdep_u64(n, m0) }, s0);
|
||||
assert_eq!(unsafe { bmi2::_pdep_u64(n, m1) }, s1);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi2"]
|
||||
fn _bzhi_u32() {
|
||||
let n = 0b1111_0010u32;
|
||||
let s = 0b0001_0010u32;
|
||||
assert_eq!(bmi2::_bzhi_u32(n, 5), s);
|
||||
assert_eq!(unsafe { bmi2::_bzhi_u32(n, 5) }, s);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi2"]
|
||||
|
|
@ -188,14 +186,14 @@ mod tests {
|
|||
fn _bzhi_u64() {
|
||||
let n = 0b1111_0010u64;
|
||||
let s = 0b0001_0010u64;
|
||||
assert_eq!(bmi2::_bzhi_u64(n, 5), s);
|
||||
assert_eq!(unsafe { bmi2::_bzhi_u64(n, 5) }, s);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi2"]
|
||||
fn _mulx_u32() {
|
||||
let a: u32 = 4_294_967_200;
|
||||
let b: u32 = 2;
|
||||
let (lo, hi): (u32, u32) = bmi2::_mulx_u32(a, b);
|
||||
let (lo, hi): (u32, u32) = unsafe { bmi2::_mulx_u32(a, b) };
|
||||
// result = 8589934400
|
||||
// = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64
|
||||
// ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
|
@ -208,7 +206,7 @@ mod tests {
|
|||
fn _mulx_u64() {
|
||||
let a: u64 = 9_223_372_036_854_775_800;
|
||||
let b: u64 = 100;
|
||||
let (lo, hi): (u64, u64) = bmi2::_mulx_u64(a, b);
|
||||
let (lo, hi): (u64, u64) = unsafe { bmi2::_mulx_u64(a, b) };
|
||||
// result = 922337203685477580000
|
||||
// = 0b00110001_11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u128
|
||||
// ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
|
|
|||
|
|
@ -9,15 +9,15 @@ use stdsimd_test::assert_instr;
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(addss))]
|
||||
pub fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { addss(a, b) }
|
||||
pub unsafe fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 {
|
||||
addss(a, b)
|
||||
}
|
||||
|
||||
/// Adds f32x4 vectors.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(addps))]
|
||||
pub fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
pub unsafe fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
a + b
|
||||
}
|
||||
|
||||
|
|
@ -26,15 +26,15 @@ pub fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(subss))]
|
||||
pub fn _mm_sub_ss(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { subss(a, b) }
|
||||
pub unsafe fn _mm_sub_ss(a: f32x4, b: f32x4) -> f32x4 {
|
||||
subss(a, b)
|
||||
}
|
||||
|
||||
/// Subtracts f32x4 vectors.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(subps))]
|
||||
pub fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
pub unsafe fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
a - b
|
||||
}
|
||||
|
||||
|
|
@ -43,15 +43,15 @@ pub fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(mulss))]
|
||||
pub fn _mm_mul_ss(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { mulss(a, b) }
|
||||
pub unsafe fn _mm_mul_ss(a: f32x4, b: f32x4) -> f32x4 {
|
||||
mulss(a, b)
|
||||
}
|
||||
|
||||
/// Multiplies f32x4 vectors.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(mulps))]
|
||||
pub fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
pub unsafe fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
a * b
|
||||
}
|
||||
|
||||
|
|
@ -60,15 +60,15 @@ pub fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(divss))]
|
||||
pub fn _mm_div_ss(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { divss(a, b) }
|
||||
pub unsafe fn _mm_div_ss(a: f32x4, b: f32x4) -> f32x4 {
|
||||
divss(a, b)
|
||||
}
|
||||
|
||||
/// Divides f32x4 vectors.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(divps))]
|
||||
pub fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
pub unsafe fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
a / b
|
||||
}
|
||||
|
||||
|
|
@ -77,8 +77,8 @@ pub fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(sqrtss))]
|
||||
pub fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
|
||||
unsafe { sqrtss(a) }
|
||||
pub unsafe fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
|
||||
sqrtss(a)
|
||||
}
|
||||
|
||||
/// Return the square root of packed single-precision (32-bit) floating-point
|
||||
|
|
@ -86,8 +86,8 @@ pub fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(sqrtps))]
|
||||
pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
|
||||
unsafe { sqrtps(a) }
|
||||
pub unsafe fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
|
||||
sqrtps(a)
|
||||
}
|
||||
|
||||
/// Return the approximate reciprocal of the first single-precision
|
||||
|
|
@ -95,8 +95,8 @@ pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(rcpss))]
|
||||
pub fn _mm_rcp_ss(a: f32x4) -> f32x4 {
|
||||
unsafe { rcpss(a) }
|
||||
pub unsafe fn _mm_rcp_ss(a: f32x4) -> f32x4 {
|
||||
rcpss(a)
|
||||
}
|
||||
|
||||
/// Return the approximate reciprocal of packed single-precision (32-bit)
|
||||
|
|
@ -104,8 +104,8 @@ pub fn _mm_rcp_ss(a: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(rcpps))]
|
||||
pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
|
||||
unsafe { rcpps(a) }
|
||||
pub unsafe fn _mm_rcp_ps(a: f32x4) -> f32x4 {
|
||||
rcpps(a)
|
||||
}
|
||||
|
||||
/// Return the approximate reciprocal square root of the fist single-precision
|
||||
|
|
@ -113,8 +113,8 @@ pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(rsqrtss))]
|
||||
pub fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
|
||||
unsafe { rsqrtss(a) }
|
||||
pub unsafe fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
|
||||
rsqrtss(a)
|
||||
}
|
||||
|
||||
/// Return the approximate reciprocal square root of packed single-precision
|
||||
|
|
@ -122,8 +122,8 @@ pub fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(rsqrtps))]
|
||||
pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
|
||||
unsafe { rsqrtps(a) }
|
||||
pub unsafe fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
|
||||
rsqrtps(a)
|
||||
}
|
||||
|
||||
/// Compare the first single-precision (32-bit) floating-point element of `a`
|
||||
|
|
@ -132,8 +132,8 @@ pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(minss))]
|
||||
pub fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { minss(a, b) }
|
||||
pub unsafe fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
|
||||
minss(a, b)
|
||||
}
|
||||
|
||||
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
|
||||
|
|
@ -141,8 +141,8 @@ pub fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(minps))]
|
||||
pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { minps(a, b) }
|
||||
pub unsafe fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
minps(a, b)
|
||||
}
|
||||
|
||||
/// Compare the first single-precision (32-bit) floating-point element of `a`
|
||||
|
|
@ -151,8 +151,8 @@ pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(maxss))]
|
||||
pub fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { maxss(a, b) }
|
||||
pub unsafe fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
|
||||
maxss(a, b)
|
||||
}
|
||||
|
||||
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
|
||||
|
|
@ -160,24 +160,23 @@ pub fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(maxps))]
|
||||
pub fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { maxps(a, b) }
|
||||
pub unsafe fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
maxps(a, b)
|
||||
}
|
||||
|
||||
// Shuffle packed single-precision (32-bit) floating-point elements in `a` and `b`
|
||||
// using `mask`.
|
||||
// The lower half of result takes values from `a` and the higher half from `b`.
|
||||
// Mask is split to 2 control bits each to index the element from inputs.
|
||||
/// Shuffle packed single-precision (32-bit) floating-point elements in `a` and
|
||||
/// `b` using `mask`.
|
||||
///
|
||||
/// The lower half of result takes values from `a` and the higher half from
|
||||
/// `b`. Mask is split to 2 control bits each to index the element from inputs.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
pub fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
|
||||
pub unsafe fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
|
||||
let mask = (mask & 0xFF) as u8;
|
||||
|
||||
macro_rules! shuffle_done {
|
||||
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
|
||||
unsafe {
|
||||
simd_shuffle4(a, b, [$x01, $x23, $x45, $x67])
|
||||
}
|
||||
simd_shuffle4(a, b, [$x01, $x23, $x45, $x67])
|
||||
}
|
||||
}
|
||||
macro_rules! shuffle_x67 {
|
||||
|
|
@ -219,10 +218,10 @@ pub fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
|
|||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[cfg_attr(test, assert_instr(shufps))]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(shufps))]
|
||||
fn _test_mm_shuffle_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
_mm_shuffle_ps(a, b, 3)
|
||||
unsafe { _mm_shuffle_ps(a, b, 3) }
|
||||
}
|
||||
|
||||
/// Unpack and interleave single-precision (32-bit) floating-point elements
|
||||
|
|
@ -230,8 +229,8 @@ fn _test_mm_shuffle_ps(a: f32x4, b: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(unpckhps))]
|
||||
pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { simd_shuffle4(a, b, [2, 6, 3, 7]) }
|
||||
pub unsafe fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
simd_shuffle4(a, b, [2, 6, 3, 7])
|
||||
}
|
||||
|
||||
/// Unpack and interleave single-precision (32-bit) floating-point elements
|
||||
|
|
@ -239,8 +238,8 @@ pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(unpcklps))]
|
||||
pub fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { simd_shuffle4(a, b, [0, 4, 1, 5]) }
|
||||
pub unsafe fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
simd_shuffle4(a, b, [0, 4, 1, 5])
|
||||
}
|
||||
|
||||
/// Combine higher half of `a` and `b`. The highwe half of `b` occupies the lower
|
||||
|
|
@ -249,9 +248,9 @@ pub fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
|
|||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(all(test, not(windows)), assert_instr(movhlps))]
|
||||
#[cfg_attr(all(test, windows), assert_instr(unpckhpd))]
|
||||
pub fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
pub unsafe fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
// TODO; figure why this is a different instruction on Windows?
|
||||
unsafe { simd_shuffle4(a, b, [6, 7, 2, 3]) }
|
||||
simd_shuffle4(a, b, [6, 7, 2, 3])
|
||||
}
|
||||
|
||||
/// Combine lower half of `a` and `b`. The lower half of `b` occupies the higher
|
||||
|
|
@ -259,8 +258,8 @@ pub fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(unpcklpd))]
|
||||
pub fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { simd_shuffle4(a, b, [0, 1, 4, 5]) }
|
||||
pub unsafe fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
simd_shuffle4(a, b, [0, 1, 4, 5])
|
||||
}
|
||||
|
||||
/// Return a mask of the most significant bit of each element in `a`.
|
||||
|
|
@ -270,8 +269,8 @@ pub fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(movmskps))]
|
||||
pub fn _mm_movemask_ps(a: f32x4) -> i32 {
|
||||
unsafe { movmskps(a) }
|
||||
pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
|
||||
movmskps(a)
|
||||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
|
|
@ -318,7 +317,7 @@ mod tests {
|
|||
fn _mm_add_ps() {
|
||||
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
|
||||
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
|
||||
let r = sse::_mm_add_ps(a, b);
|
||||
let r = unsafe { sse::_mm_add_ps(a, b) };
|
||||
assert_eq!(r, f32x4::new(-101.0, 25.0, 0.0, -15.0));
|
||||
}
|
||||
|
||||
|
|
@ -326,7 +325,7 @@ mod tests {
|
|||
fn _mm_add_ss() {
|
||||
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
|
||||
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
|
||||
let r = sse::_mm_add_ss(a, b);
|
||||
let r = unsafe { sse::_mm_add_ss(a, b) };
|
||||
assert_eq!(r, f32x4::new(-101.0, 5.0, 0.0, -10.0));
|
||||
}
|
||||
|
||||
|
|
@ -334,7 +333,7 @@ mod tests {
|
|||
fn _mm_sub_ps() {
|
||||
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
|
||||
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
|
||||
let r = sse::_mm_sub_ps(a, b);
|
||||
let r = unsafe { sse::_mm_sub_ps(a, b) };
|
||||
assert_eq!(r, f32x4::new(99.0, -15.0, 0.0, -5.0));
|
||||
}
|
||||
|
||||
|
|
@ -342,7 +341,7 @@ mod tests {
|
|||
fn _mm_sub_ss() {
|
||||
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
|
||||
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
|
||||
let r = sse::_mm_sub_ss(a, b);
|
||||
let r = unsafe { sse::_mm_sub_ss(a, b) };
|
||||
assert_eq!(r, f32x4::new(99.0, 5.0, 0.0, -10.0));
|
||||
}
|
||||
|
||||
|
|
@ -350,7 +349,7 @@ mod tests {
|
|||
fn _mm_mul_ps() {
|
||||
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
|
||||
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
|
||||
let r = sse::_mm_mul_ps(a, b);
|
||||
let r = unsafe { sse::_mm_mul_ps(a, b) };
|
||||
assert_eq!(r, f32x4::new(100.0, 100.0, 0.0, 50.0));
|
||||
}
|
||||
|
||||
|
|
@ -358,7 +357,7 @@ mod tests {
|
|||
fn _mm_mul_ss() {
|
||||
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
|
||||
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
|
||||
let r = sse::_mm_mul_ss(a, b);
|
||||
let r = unsafe { sse::_mm_mul_ss(a, b) };
|
||||
assert_eq!(r, f32x4::new(100.0, 5.0, 0.0, -10.0));
|
||||
}
|
||||
|
||||
|
|
@ -366,7 +365,7 @@ mod tests {
|
|||
fn _mm_div_ps() {
|
||||
let a = f32x4::new(-1.0, 5.0, 2.0, -10.0);
|
||||
let b = f32x4::new(-100.0, 20.0, 0.2, -5.0);
|
||||
let r = sse::_mm_div_ps(a, b);
|
||||
let r = unsafe { sse::_mm_div_ps(a, b) };
|
||||
assert_eq!(r, f32x4::new(0.01, 0.25, 10.0, 2.0));
|
||||
}
|
||||
|
||||
|
|
@ -374,14 +373,14 @@ mod tests {
|
|||
fn _mm_div_ss() {
|
||||
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
|
||||
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
|
||||
let r = sse::_mm_div_ss(a, b);
|
||||
let r = unsafe { sse::_mm_div_ss(a, b) };
|
||||
assert_eq!(r, f32x4::new(0.01, 5.0, 0.0, -10.0));
|
||||
}
|
||||
|
||||
#[simd_test = "sse"]
|
||||
fn _mm_sqrt_ss() {
|
||||
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
|
||||
let r = sse::_mm_sqrt_ss(a);
|
||||
let r = unsafe { sse::_mm_sqrt_ss(a) };
|
||||
let e = f32x4::new(2.0, 13.0, 16.0, 100.0);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -389,7 +388,7 @@ mod tests {
|
|||
#[simd_test = "sse"]
|
||||
fn _mm_sqrt_ps() {
|
||||
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
|
||||
let r = sse::_mm_sqrt_ps(a);
|
||||
let r = unsafe { sse::_mm_sqrt_ps(a) };
|
||||
let e = f32x4::new(2.0, 3.6055512, 4.0, 10.0);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -397,7 +396,7 @@ mod tests {
|
|||
#[simd_test = "sse"]
|
||||
fn _mm_rcp_ss() {
|
||||
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
|
||||
let r = sse::_mm_rcp_ss(a);
|
||||
let r = unsafe { sse::_mm_rcp_ss(a) };
|
||||
let e = f32x4::new(0.24993896, 13.0, 16.0, 100.0);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -405,7 +404,7 @@ mod tests {
|
|||
#[simd_test = "sse"]
|
||||
fn _mm_rcp_ps() {
|
||||
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
|
||||
let r = sse::_mm_rcp_ps(a);
|
||||
let r = unsafe { sse::_mm_rcp_ps(a) };
|
||||
let e = f32x4::new(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -413,7 +412,7 @@ mod tests {
|
|||
#[simd_test = "sse"]
|
||||
fn _mm_rsqrt_ss() {
|
||||
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
|
||||
let r = sse::_mm_rsqrt_ss(a);
|
||||
let r = unsafe { sse::_mm_rsqrt_ss(a) };
|
||||
let e = f32x4::new(0.49987793, 13.0, 16.0, 100.0);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -421,7 +420,7 @@ mod tests {
|
|||
#[simd_test = "sse"]
|
||||
fn _mm_rsqrt_ps() {
|
||||
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
|
||||
let r = sse::_mm_rsqrt_ps(a);
|
||||
let r = unsafe { sse::_mm_rsqrt_ps(a) };
|
||||
let e = f32x4::new(0.49987793, 0.2772827, 0.24993896, 0.099990845);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -430,7 +429,7 @@ mod tests {
|
|||
fn _mm_min_ss() {
|
||||
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
|
||||
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
|
||||
let r = sse::_mm_min_ss(a, b);
|
||||
let r = unsafe { sse::_mm_min_ss(a, b) };
|
||||
assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
|
||||
}
|
||||
|
||||
|
|
@ -438,7 +437,7 @@ mod tests {
|
|||
fn _mm_min_ps() {
|
||||
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
|
||||
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
|
||||
let r = sse::_mm_min_ps(a, b);
|
||||
let r = unsafe { sse::_mm_min_ps(a, b) };
|
||||
assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
|
||||
}
|
||||
|
||||
|
|
@ -446,7 +445,7 @@ mod tests {
|
|||
fn _mm_max_ss() {
|
||||
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
|
||||
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
|
||||
let r = sse::_mm_max_ss(a, b);
|
||||
let r = unsafe { sse::_mm_max_ss(a, b) };
|
||||
assert_eq!(r, f32x4::new(-1.0, 5.0, 0.0, -10.0));
|
||||
}
|
||||
|
||||
|
|
@ -454,7 +453,7 @@ mod tests {
|
|||
fn _mm_max_ps() {
|
||||
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
|
||||
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
|
||||
let r = sse::_mm_max_ps(a, b);
|
||||
let r = unsafe { sse::_mm_max_ps(a, b) };
|
||||
assert_eq!(r, f32x4::new(-1.0, 20.0, 0.0, -5.0));
|
||||
}
|
||||
|
||||
|
|
@ -463,7 +462,7 @@ mod tests {
|
|||
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
|
||||
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
|
||||
let mask = 0b00_01_01_11;
|
||||
let r = sse::_mm_shuffle_ps(a, b, mask);
|
||||
let r = unsafe { sse::_mm_shuffle_ps(a, b, mask) };
|
||||
assert_eq!(r, f32x4::new(4.0, 2.0, 6.0, 5.0));
|
||||
}
|
||||
|
||||
|
|
@ -471,7 +470,7 @@ mod tests {
|
|||
fn _mm_unpackhi_ps() {
|
||||
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
|
||||
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
|
||||
let r = sse::_mm_unpackhi_ps(a, b);
|
||||
let r = unsafe { sse::_mm_unpackhi_ps(a, b) };
|
||||
assert_eq!(r, f32x4::new(3.0, 7.0, 4.0, 8.0));
|
||||
}
|
||||
|
||||
|
|
@ -479,7 +478,7 @@ mod tests {
|
|||
fn _mm_unpacklo_ps() {
|
||||
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
|
||||
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
|
||||
let r = sse::_mm_unpacklo_ps(a, b);
|
||||
let r = unsafe { sse::_mm_unpacklo_ps(a, b) };
|
||||
assert_eq!(r, f32x4::new(1.0, 5.0, 2.0, 6.0));
|
||||
}
|
||||
|
||||
|
|
@ -487,7 +486,7 @@ mod tests {
|
|||
fn _mm_movehl_ps() {
|
||||
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
|
||||
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
|
||||
let r = sse::_mm_movehl_ps(a, b);
|
||||
let r = unsafe { sse::_mm_movehl_ps(a, b) };
|
||||
assert_eq!(r, f32x4::new(7.0, 8.0, 3.0, 4.0));
|
||||
}
|
||||
|
||||
|
|
@ -495,16 +494,20 @@ mod tests {
|
|||
fn _mm_movelh_ps() {
|
||||
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
|
||||
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
|
||||
let r = sse::_mm_movelh_ps(a, b);
|
||||
let r = unsafe { sse::_mm_movelh_ps(a, b) };
|
||||
assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0));
|
||||
}
|
||||
|
||||
#[simd_test = "sse"]
|
||||
fn _mm_movemask_ps() {
|
||||
let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0));
|
||||
let r = unsafe {
|
||||
sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0))
|
||||
};
|
||||
assert_eq!(r, 0b0101);
|
||||
|
||||
let r = sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0));
|
||||
let r = unsafe {
|
||||
sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0))
|
||||
};
|
||||
assert_eq!(r, 0b0111);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -1,18 +1,18 @@
|
|||
use v128::*;
|
||||
use x86::__m128i;
|
||||
|
||||
#[cfg(test)]
|
||||
use stdsimd_test::assert_instr;
|
||||
|
||||
use v128::*;
|
||||
use x86::__m128i;
|
||||
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(pblendvb))]
|
||||
pub fn _mm_blendv_epi8(
|
||||
pub unsafe fn _mm_blendv_epi8(
|
||||
a: __m128i,
|
||||
b: __m128i,
|
||||
mask: __m128i,
|
||||
) -> __m128i {
|
||||
unsafe { pblendvb(a, b, mask) }
|
||||
pblendvb(a, b, mask)
|
||||
}
|
||||
|
||||
/// Returns the dot product of two f64x2 vectors.
|
||||
|
|
@ -24,15 +24,20 @@ pub fn _mm_blendv_epi8(
|
|||
/// the broadcast mask bit is zero then the return component will be zero.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
pub fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
|
||||
pub unsafe fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => {
|
||||
unsafe { dppd(a, b, $imm8) }
|
||||
}
|
||||
($imm8:expr) => { dppd(a, b, $imm8) }
|
||||
}
|
||||
constify_imm8!(imm8, call)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(dppd))]
|
||||
fn _test_mm_dp_pd(a: f64x2, b: f64x2) -> f64x2 {
|
||||
unsafe { _mm_dp_pd(a, b, 0) }
|
||||
}
|
||||
|
||||
/// Returns the dot product of two f32x4 vectors.
|
||||
///
|
||||
/// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask.
|
||||
|
|
@ -42,15 +47,20 @@ pub fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
|
|||
/// the broadcast mask bit is zero then the return component will be zero.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
pub fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
|
||||
pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => {
|
||||
unsafe { dpps(a, b, $imm8) }
|
||||
}
|
||||
($imm8:expr) => { dpps(a, b, $imm8) }
|
||||
}
|
||||
constify_imm8!(imm8, call)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(dpps))]
|
||||
fn _test_mm_dp_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { _mm_dp_ps(a, b, 0) }
|
||||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
extern {
|
||||
#[link_name = "llvm.x86.sse41.pblendvb"]
|
||||
|
|
@ -78,7 +88,7 @@ mod tests {
|
|||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
|
||||
let e = i8x16::new(
|
||||
0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31);
|
||||
assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e);
|
||||
assert_eq!(unsafe { sse41::_mm_blendv_epi8(a, b, mask) }, e);
|
||||
}
|
||||
|
||||
#[simd_test = "sse4.1"]
|
||||
|
|
@ -86,7 +96,7 @@ mod tests {
|
|||
let a = f64x2::new(2.0, 3.0);
|
||||
let b = f64x2::new(1.0, 4.0);
|
||||
let e = f64x2::new(14.0, 0.0);
|
||||
assert_eq!(sse41::_mm_dp_pd(a, b, 0b00110001), e);
|
||||
assert_eq!(unsafe { sse41::_mm_dp_pd(a, b, 0b00110001) }, e);
|
||||
}
|
||||
|
||||
#[simd_test = "sse4.1"]
|
||||
|
|
@ -94,6 +104,6 @@ mod tests {
|
|||
let a = f32x4::new(2.0, 3.0, 1.0, 10.0);
|
||||
let b = f32x4::new(1.0, 4.0, 0.5, 10.0);
|
||||
let e = f32x4::new(14.5, 0.0, 14.5, 0.0);
|
||||
assert_eq!(sse41::_mm_dp_ps(a, b, 0b01110101), e);
|
||||
assert_eq!(unsafe { sse41::_mm_dp_ps(a, b, 0b01110101) }, e);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,3 +1,6 @@
|
|||
#[cfg(test)]
|
||||
use stdsimd_test::assert_instr;
|
||||
|
||||
use x86::__m128i;
|
||||
|
||||
pub const _SIDD_UBYTE_OPS: i8 = 0b00000000;
|
||||
|
|
@ -19,7 +22,7 @@ pub const _SIDD_MOST_SIGNIFICANT: i8 = 0b01000000;
|
|||
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.2"]
|
||||
pub fn _mm_cmpestri(
|
||||
pub unsafe fn _mm_cmpestri(
|
||||
a: __m128i,
|
||||
la: i32,
|
||||
b: __m128i,
|
||||
|
|
@ -27,13 +30,18 @@ pub fn _mm_cmpestri(
|
|||
imm8: i8,
|
||||
) -> i32 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => {
|
||||
unsafe { pcmpestri128(a, la, b, lb, $imm8) }
|
||||
}
|
||||
($imm8:expr) => { pcmpestri128(a, la, b, lb, $imm8) }
|
||||
}
|
||||
constify_imm8!(imm8, call)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpestri))]
|
||||
fn _test_mm_cmpestri(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 {
|
||||
unsafe { _mm_cmpestri(a, la, b, lb, 0) }
|
||||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
extern {
|
||||
#[link_name = "llvm.x86.sse42.pcmpestri128"]
|
||||
|
|
@ -53,8 +61,10 @@ mod tests {
|
|||
let b = &b"foobar "[..];
|
||||
let va = __m128i::from(u8x16::load(a, 0));
|
||||
let vb = __m128i::from(u8x16::load(b, 0));
|
||||
let i = sse42::_mm_cmpestri(
|
||||
va, 3, vb, 6, sse42::_SIDD_CMP_EQUAL_ORDERED);
|
||||
let i = unsafe {
|
||||
sse42::_mm_cmpestri(
|
||||
va, 3, vb, 6, sse42::_SIDD_CMP_EQUAL_ORDERED)
|
||||
};
|
||||
assert_eq!(3, i);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,15 +1,15 @@
|
|||
use v128::*;
|
||||
|
||||
#[cfg(test)]
|
||||
use stdsimd_test::assert_instr;
|
||||
|
||||
use v128::*;
|
||||
|
||||
/// Compute the absolute value of packed 8-bit signed integers in `a` and
|
||||
/// return the unsigned results.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+ssse3"]
|
||||
#[cfg_attr(test, assert_instr(pabsb))]
|
||||
pub fn _mm_abs_epi8(a: i8x16) -> u8x16 {
|
||||
unsafe { pabsb128(a) }
|
||||
pub unsafe fn _mm_abs_epi8(a: i8x16) -> u8x16 {
|
||||
pabsb128(a)
|
||||
}
|
||||
|
||||
/// Shuffle bytes from `a` according to the content of `b`.
|
||||
|
|
@ -39,8 +39,8 @@ pub fn _mm_abs_epi8(a: i8x16) -> u8x16 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+ssse3"]
|
||||
#[cfg_attr(test, assert_instr(pshufb))]
|
||||
pub fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
|
||||
unsafe { pshufb128(a, b) }
|
||||
pub unsafe fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
|
||||
pshufb128(a, b)
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -48,7 +48,6 @@ pub fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
|
|||
extern {
|
||||
#[link_name = "llvm.x86.ssse3.pabs.b.128"]
|
||||
fn pabsb128(a: i8x16) -> u8x16;
|
||||
|
||||
#[link_name = "llvm.x86.ssse3.pshuf.b.128"]
|
||||
fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
|
||||
}
|
||||
|
|
@ -62,16 +61,31 @@ mod tests {
|
|||
|
||||
#[simd_test = "ssse3"]
|
||||
fn _mm_abs_epi8() {
|
||||
let r = ssse3::_mm_abs_epi8(i8x16::splat(-5));
|
||||
let r = unsafe { ssse3::_mm_abs_epi8(i8x16::splat(-5)) };
|
||||
assert_eq!(r, u8x16::splat(5));
|
||||
}
|
||||
|
||||
#[simd_test = "ssse3"]
|
||||
fn _mm_shuffle_epi8() {
|
||||
let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let b = u8x16::new(4, 128, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
|
||||
let expected = u8x16::new(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
|
||||
let r = ssse3::_mm_shuffle_epi8(a, b);
|
||||
let a = u8x16::new(
|
||||
1, 2, 3, 4,
|
||||
5, 6, 7, 8,
|
||||
9, 10, 11, 12,
|
||||
13, 14, 15, 16,
|
||||
);
|
||||
let b = u8x16::new(
|
||||
4, 128, 4, 3,
|
||||
24, 12, 6, 19,
|
||||
12, 5, 5, 10,
|
||||
4, 1, 8, 0,
|
||||
);
|
||||
let expected = u8x16::new(
|
||||
5, 0, 5, 4,
|
||||
9, 13, 7, 4,
|
||||
13, 6, 6, 11,
|
||||
5, 2, 9, 1,
|
||||
);
|
||||
let r = unsafe { ssse3::_mm_shuffle_epi8(a, b) };
|
||||
assert_eq!(r, expected);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(blcfill))]
|
||||
pub fn _blcfill_u32(x: u32) -> u32 {
|
||||
pub unsafe fn _blcfill_u32(x: u32) -> u32 {
|
||||
x & (x.wrapping_add(1))
|
||||
}
|
||||
|
||||
|
|
@ -76,7 +76,7 @@ pub fn _blcfill_u32(x: u32) -> u32 {
|
|||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(blcfill))]
|
||||
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
|
||||
pub fn _blcfill_u64(x: u64) -> u64 {
|
||||
pub unsafe fn _blcfill_u64(x: u64) -> u64 {
|
||||
x & (x.wrapping_add(1))
|
||||
}
|
||||
|
||||
|
|
@ -86,7 +86,7 @@ pub fn _blcfill_u64(x: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(blci))]
|
||||
pub fn _blci_u32(x: u32) -> u32 {
|
||||
pub unsafe fn _blci_u32(x: u32) -> u32 {
|
||||
x | !(x.wrapping_add(1))
|
||||
}
|
||||
|
||||
|
|
@ -97,7 +97,7 @@ pub fn _blci_u32(x: u32) -> u32 {
|
|||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(blci))]
|
||||
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
|
||||
pub fn _blci_u64(x: u64) -> u64 {
|
||||
pub unsafe fn _blci_u64(x: u64) -> u64 {
|
||||
x | !(x.wrapping_add(1))
|
||||
}
|
||||
|
||||
|
|
@ -107,7 +107,7 @@ pub fn _blci_u64(x: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(blcic))]
|
||||
pub fn _blcic_u32(x: u32) -> u32 {
|
||||
pub unsafe fn _blcic_u32(x: u32) -> u32 {
|
||||
!x & (x.wrapping_add(1))
|
||||
}
|
||||
|
||||
|
|
@ -118,7 +118,7 @@ pub fn _blcic_u32(x: u32) -> u32 {
|
|||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(blcic))]
|
||||
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
|
||||
pub fn _blcic_u64(x: u64) -> u64 {
|
||||
pub unsafe fn _blcic_u64(x: u64) -> u64 {
|
||||
!x & (x.wrapping_add(1))
|
||||
}
|
||||
|
||||
|
|
@ -128,7 +128,7 @@ pub fn _blcic_u64(x: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(blcmsk))]
|
||||
pub fn _blcmsk_u32(x: u32) -> u32 {
|
||||
pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
|
||||
x ^ (x.wrapping_add(1))
|
||||
}
|
||||
|
||||
|
|
@ -139,7 +139,7 @@ pub fn _blcmsk_u32(x: u32) -> u32 {
|
|||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(blcmsk))]
|
||||
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
|
||||
pub fn _blcmsk_u64(x: u64) -> u64 {
|
||||
pub unsafe fn _blcmsk_u64(x: u64) -> u64 {
|
||||
x ^ (x.wrapping_add(1))
|
||||
}
|
||||
|
||||
|
|
@ -149,7 +149,7 @@ pub fn _blcmsk_u64(x: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(blcs))]
|
||||
pub fn _blcs_u32(x: u32) -> u32 {
|
||||
pub unsafe fn _blcs_u32(x: u32) -> u32 {
|
||||
x | (x.wrapping_add(1))
|
||||
}
|
||||
|
||||
|
|
@ -160,7 +160,7 @@ pub fn _blcs_u32(x: u32) -> u32 {
|
|||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(blcs))]
|
||||
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
|
||||
pub fn _blcs_u64(x: u64) -> u64 {
|
||||
pub unsafe fn _blcs_u64(x: u64) -> u64 {
|
||||
x | x.wrapping_add(1)
|
||||
}
|
||||
|
||||
|
|
@ -170,7 +170,7 @@ pub fn _blcs_u64(x: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(blsfill))]
|
||||
pub fn _blsfill_u32(x: u32) -> u32 {
|
||||
pub unsafe fn _blsfill_u32(x: u32) -> u32 {
|
||||
x | (x.wrapping_sub(1))
|
||||
}
|
||||
|
||||
|
|
@ -181,7 +181,7 @@ pub fn _blsfill_u32(x: u32) -> u32 {
|
|||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(blsfill))]
|
||||
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
|
||||
pub fn _blsfill_u64(x: u64) -> u64 {
|
||||
pub unsafe fn _blsfill_u64(x: u64) -> u64 {
|
||||
x | (x.wrapping_sub(1))
|
||||
}
|
||||
|
||||
|
|
@ -191,7 +191,7 @@ pub fn _blsfill_u64(x: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(blsic))]
|
||||
pub fn _blsic_u32(x: u32) -> u32 {
|
||||
pub unsafe fn _blsic_u32(x: u32) -> u32 {
|
||||
!x | (x.wrapping_sub(1))
|
||||
}
|
||||
|
||||
|
|
@ -202,7 +202,7 @@ pub fn _blsic_u32(x: u32) -> u32 {
|
|||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(blsic))]
|
||||
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
|
||||
pub fn _blsic_u64(x: u64) -> u64 {
|
||||
pub unsafe fn _blsic_u64(x: u64) -> u64 {
|
||||
!x | (x.wrapping_sub(1))
|
||||
}
|
||||
|
||||
|
|
@ -213,7 +213,7 @@ pub fn _blsic_u64(x: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(t1mskc))]
|
||||
pub fn _t1mskc_u32(x: u32) -> u32 {
|
||||
pub unsafe fn _t1mskc_u32(x: u32) -> u32 {
|
||||
!x | (x.wrapping_add(1))
|
||||
}
|
||||
|
||||
|
|
@ -225,7 +225,7 @@ pub fn _t1mskc_u32(x: u32) -> u32 {
|
|||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(t1mskc))]
|
||||
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
|
||||
pub fn _t1mskc_u64(x: u64) -> u64 {
|
||||
pub unsafe fn _t1mskc_u64(x: u64) -> u64 {
|
||||
!x | (x.wrapping_add(1))
|
||||
}
|
||||
|
||||
|
|
@ -236,7 +236,7 @@ pub fn _t1mskc_u64(x: u64) -> u64 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(tzmsk))]
|
||||
pub fn _tzmsk_u32(x: u32) -> u32 {
|
||||
pub unsafe fn _tzmsk_u32(x: u32) -> u32 {
|
||||
!x & (x.wrapping_sub(1))
|
||||
}
|
||||
|
||||
|
|
@ -248,7 +248,7 @@ pub fn _tzmsk_u32(x: u32) -> u32 {
|
|||
#[target_feature = "+tbm"]
|
||||
#[cfg_attr(test, assert_instr(tzmsk))]
|
||||
#[cfg(not(target_arch = "x86"))] // generates lots of instructions
|
||||
pub fn _tzmsk_u64(x: u64) -> u64 {
|
||||
pub unsafe fn _tzmsk_u64(x: u64) -> u64 {
|
||||
!x & (x.wrapping_sub(1))
|
||||
}
|
||||
|
||||
|
|
@ -272,122 +272,174 @@ mod tests {
|
|||
|
||||
#[simd_test = "tbm"]
|
||||
fn _blcfill_u32() {
|
||||
assert_eq!(tbm::_blcfill_u32(0b0101_0111u32), 0b0101_0000u32);
|
||||
assert_eq!(tbm::_blcfill_u32(0b1111_1111u32), 0u32);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blcfill_u32(0b0101_0111u32) },
|
||||
0b0101_0000u32);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blcfill_u32(0b1111_1111u32) },
|
||||
0u32);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _blcfill_u64() {
|
||||
assert_eq!(tbm::_blcfill_u64(0b0101_0111u64), 0b0101_0000u64);
|
||||
assert_eq!(tbm::_blcfill_u64(0b1111_1111u64), 0u64);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blcfill_u64(0b0101_0111u64) },
|
||||
0b0101_0000u64);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blcfill_u64(0b1111_1111u64) },
|
||||
0u64);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
fn _blci_u32() {
|
||||
assert_eq!(tbm::_blci_u32(0b0101_0000u32),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1110u32);
|
||||
assert_eq!(tbm::_blci_u32(0b1111_1111u32),
|
||||
0b1111_1111_1111_1111_1111_1110_1111_1111u32);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blci_u32(0b0101_0000u32) },
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1110u32);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blci_u32(0b1111_1111u32) },
|
||||
0b1111_1111_1111_1111_1111_1110_1111_1111u32);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _blci_u64() {
|
||||
assert_eq!(tbm::_blci_u64(0b0101_0000u64),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64);
|
||||
assert_eq!(tbm::_blci_u64(0b1111_1111u64),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blci_u64(0b0101_0000u64) },
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blci_u64(0b1111_1111u64) },
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
fn _blcic_u32() {
|
||||
assert_eq!(tbm::_blcic_u32(0b0101_0001u32), 0b0000_0010u32);
|
||||
assert_eq!(tbm::_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blcic_u32(0b0101_0001u32) },
|
||||
0b0000_0010u32);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blcic_u32(0b1111_1111u32) },
|
||||
0b1_0000_0000u32);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _blcic_u64() {
|
||||
assert_eq!(tbm::_blcic_u64(0b0101_0001u64), 0b0000_0010u64);
|
||||
assert_eq!(tbm::_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blcic_u64(0b0101_0001u64) },
|
||||
0b0000_0010u64);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blcic_u64(0b1111_1111u64) },
|
||||
0b1_0000_0000u64);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
fn _blcmsk_u32() {
|
||||
assert_eq!(tbm::_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32);
|
||||
assert_eq!(tbm::_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blcmsk_u32(0b0101_0001u32) },
|
||||
0b0000_0011u32);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blcmsk_u32(0b1111_1111u32) },
|
||||
0b1_1111_1111u32);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _blcmsk_u64() {
|
||||
assert_eq!(tbm::_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64);
|
||||
assert_eq!(tbm::_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blcmsk_u64(0b0101_0001u64) },
|
||||
0b0000_0011u64);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blcmsk_u64(0b1111_1111u64) },
|
||||
0b1_1111_1111u64);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
fn _blcs_u32() {
|
||||
assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32);
|
||||
assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32);
|
||||
assert_eq!(unsafe { tbm::_blcs_u32(0b0101_0001u32) }, 0b0101_0011u32);
|
||||
assert_eq!(unsafe { tbm::_blcs_u32(0b1111_1111u32) }, 0b1_1111_1111u32);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _blcs_u64() {
|
||||
assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64);
|
||||
assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64);
|
||||
assert_eq!(unsafe { tbm::_blcs_u64(0b0101_0001u64) }, 0b0101_0011u64);
|
||||
assert_eq!(unsafe { tbm::_blcs_u64(0b1111_1111u64) }, 0b1_1111_1111u64);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
fn _blsfill_u32() {
|
||||
assert_eq!(tbm::_blsfill_u32(0b0101_0100u32), 0b0101_0111u32);
|
||||
assert_eq!(tbm::_blsfill_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blsfill_u32(0b0101_0100u32) },
|
||||
0b0101_0111u32);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blsfill_u32(0u32) },
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111u32);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _blsfill_u64() {
|
||||
assert_eq!(tbm::_blsfill_u64(0b0101_0100u64), 0b0101_0111u64);
|
||||
assert_eq!(tbm::_blsfill_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blsfill_u64(0b0101_0100u64) },
|
||||
0b0101_0111u64);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blsfill_u64(0u64) },
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
fn _blsic_u32() {
|
||||
assert_eq!(tbm::_blsic_u32(0b0101_0100u32), 0b1111_1111_1111_1111_1111_1111_1111_1011u32);
|
||||
assert_eq!(tbm::_blsic_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blsic_u32(0b0101_0100u32) },
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1011u32);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blsic_u32(0u32) },
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111u32);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _blsic_u64() {
|
||||
assert_eq!(tbm::_blsic_u64(0b0101_0100u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64);
|
||||
assert_eq!(tbm::_blsic_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blsic_u64(0b0101_0100u64) },
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_blsic_u64(0u64) },
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
fn _t1mskc_u32() {
|
||||
assert_eq!(tbm::_t1mskc_u32(0b0101_0111u32), 0b1111_1111_1111_1111_1111_1111_1111_1000u32);
|
||||
assert_eq!(tbm::_t1mskc_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_t1mskc_u32(0b0101_0111u32) },
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1000u32);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_t1mskc_u32(0u32) },
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111u32);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _t1mksc_u64() {
|
||||
assert_eq!(tbm::_t1mskc_u64(0b0101_0111u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64);
|
||||
assert_eq!(tbm::_t1mskc_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_t1mskc_u64(0b0101_0111u64) },
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64);
|
||||
assert_eq!(
|
||||
unsafe { tbm::_t1mskc_u64(0u64) },
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
fn _tzmsk_u32() {
|
||||
assert_eq!(tbm::_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32);
|
||||
assert_eq!(tbm::_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32);
|
||||
assert_eq!(unsafe { tbm::_tzmsk_u32(0b0101_1000u32) }, 0b0000_0111u32);
|
||||
assert_eq!(unsafe { tbm::_tzmsk_u32(0b0101_1001u32) }, 0b0000_0000u32);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
fn _tzmsk_u64() {
|
||||
assert_eq!(tbm::_tzmsk_u64(0b0101_1000u64), 0b0000_0111u64);
|
||||
assert_eq!(tbm::_tzmsk_u64(0b0101_1001u64), 0b0000_0000u64);
|
||||
assert_eq!(unsafe { tbm::_tzmsk_u64(0b0101_1000u64) }, 0b0000_0111u64);
|
||||
assert_eq!(unsafe { tbm::_tzmsk_u64(0b0101_1001u64) }, 0b0000_0000u64);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue