diff --git a/library/stdarch/examples/play.rs b/library/stdarch/examples/play.rs index 69b692fe12ea..81a9d9b67ad8 100644 --- a/library/stdarch/examples/play.rs +++ b/library/stdarch/examples/play.rs @@ -24,9 +24,11 @@ mod example { haystack.resize(16, 0); let vhaystack = vendor::__m128i::from(s::u8x16::load(&haystack, 0)); - vendor::_mm_cmpestri( - vneedle, needle_len as i32, vhaystack, hay_len as i32, - vendor::_SIDD_CMP_EQUAL_ORDERED) as usize + unsafe { + vendor::_mm_cmpestri( + vneedle, needle_len as i32, vhaystack, hay_len as i32, + vendor::_SIDD_CMP_EQUAL_ORDERED) as usize + } } pub fn main() { diff --git a/library/stdarch/src/x86/abm.rs b/library/stdarch/src/x86/abm.rs index de47f7fcb6d0..ea779fabfc2e 100644 --- a/library/stdarch/src/x86/abm.rs +++ b/library/stdarch/src/x86/abm.rs @@ -19,7 +19,7 @@ use stdsimd_test::assert_instr; #[inline(always)] #[target_feature = "+lzcnt"] #[cfg_attr(test, assert_instr(lzcnt))] -pub fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() } +pub unsafe fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() } /// Counts the leading most significant zero bits. /// @@ -27,19 +27,19 @@ pub fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() } #[inline(always)] #[target_feature = "+lzcnt"] #[cfg_attr(test, assert_instr(lzcnt))] -pub fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 } +pub unsafe fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 } /// Counts the bits that are set. #[inline(always)] #[target_feature = "+popcnt"] #[cfg_attr(test, assert_instr(popcnt))] -pub fn _popcnt32(x: u32) -> u32 { x.count_ones() } +pub unsafe fn _popcnt32(x: u32) -> u32 { x.count_ones() } /// Counts the bits that are set. 
#[inline(always)] #[target_feature = "+popcnt"] #[cfg_attr(test, assert_instr(popcnt))] -pub fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 } +pub unsafe fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 } #[cfg(test)] mod tests { @@ -49,21 +49,21 @@ mod tests { #[simd_test = "lzcnt"] fn _lzcnt_u32() { - assert_eq!(abm::_lzcnt_u32(0b0101_1010u32), 25u32); + assert_eq!(unsafe { abm::_lzcnt_u32(0b0101_1010u32) }, 25u32); } #[simd_test = "lzcnt"] fn _lzcnt_u64() { - assert_eq!(abm::_lzcnt_u64(0b0101_1010u64), 57u64); + assert_eq!(unsafe { abm::_lzcnt_u64(0b0101_1010u64) }, 57u64); } #[simd_test = "popcnt"] fn _popcnt32() { - assert_eq!(abm::_popcnt32(0b0101_1010u32), 4); + assert_eq!(unsafe { abm::_popcnt32(0b0101_1010u32) }, 4); } #[simd_test = "popcnt"] fn _popcnt64() { - assert_eq!(abm::_popcnt64(0b0101_1010u64), 4); + assert_eq!(unsafe { abm::_popcnt64(0b0101_1010u64) }, 4); } } diff --git a/library/stdarch/src/x86/avx.rs b/library/stdarch/src/x86/avx.rs index 10cda2c111b4..e53f5670043f 100644 --- a/library/stdarch/src/x86/avx.rs +++ b/library/stdarch/src/x86/avx.rs @@ -1,14 +1,14 @@ -use v256::*; - #[cfg(test)] use stdsimd_test::assert_instr; +use v256::*; + /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. 
#[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vaddpd))] -pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 { +pub unsafe fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 { a + b } @@ -16,7 +16,7 @@ pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vaddps))] -pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 { +pub unsafe fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 { a + b } @@ -25,7 +25,7 @@ pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vmulpd))] -pub fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 { +pub unsafe fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 { a * b } @@ -33,7 +33,7 @@ pub fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vmulps))] -pub fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 { +pub unsafe fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 { a * b } @@ -42,8 +42,8 @@ pub fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vaddsubpd))] -pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 { - unsafe { addsubpd256(a, b) } +pub unsafe fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 { + addsubpd256(a, b) } /// Alternatively add and subtract packed single-precision (32-bit) @@ -51,8 +51,8 @@ pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vaddsubps))] -pub fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 { - unsafe { addsubps256(a, b) } +pub unsafe fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 { + addsubps256(a, b) } /// Subtract packed double-precision (64-bit) floating-point elements in `b` @@ -60,7 +60,7 @@ pub fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vsubpd))] 
-pub fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 { +pub unsafe fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 { a - b } @@ -69,25 +69,24 @@ pub fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vsubps))] -pub fn _mm256_sub_ps(a: f32x8, b: f32x8) -> f32x8 { +pub unsafe fn _mm256_sub_ps(a: f32x8, b: f32x8) -> f32x8 { a - b } /// Round packed double-precision (64-bit) floating point elements in `a` /// according to the flag `b`. The value of `b` may be as follows: +/// +/// ```ignore /// 0x00: Round to the nearest whole number. /// 0x01: Round down, toward negative infinity. /// 0x02: Round up, toward positive infinity. /// 0x03: Truncate the values. -/// For a few additional values options, check the LLVM docs: -/// https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// ``` #[inline(always)] #[target_feature = "+avx"] -pub fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 { +pub unsafe fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 { macro_rules! 
call { - ($imm8:expr) => { - unsafe { roundpd256(a, $imm8) } - } + ($imm8:expr) => { roundpd256(a, $imm8) } } constify_imm8!(b, call) } @@ -96,7 +95,7 @@ pub fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 { #[cfg_attr(test, assert_instr(vroundpd))] #[target_feature = "+avx"] fn test_mm256_round_pd(a: f64x4) -> f64x4 { - _mm256_round_pd(a, 0x3) + unsafe { _mm256_round_pd(a, 0x3) } } /// Round packed double-precision (64-bit) floating point elements in `a` toward @@ -104,8 +103,8 @@ fn test_mm256_round_pd(a: f64x4) -> f64x4 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vroundpd))] -pub fn _mm256_ceil_pd(a: f64x4) -> f64x4 { - unsafe { roundpd256(a, 0x02) } +pub unsafe fn _mm256_ceil_pd(a: f64x4) -> f64x4 { + roundpd256(a, 0x02) } /// Round packed double-precision (64-bit) floating point elements in `a` toward @@ -113,8 +112,8 @@ pub fn _mm256_ceil_pd(a: f64x4) -> f64x4 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vroundpd))] -pub fn _mm256_floor_pd(a: f64x4) -> f64x4 { - unsafe { roundpd256(a, 0x01) } +pub unsafe fn _mm256_floor_pd(a: f64x4) -> f64x4 { + roundpd256(a, 0x01) } /// LLVM intrinsics used in the above functions @@ -139,7 +138,7 @@ mod tests { fn _mm256_add_pd() { let a = f64x4::new(1.0, 2.0, 3.0, 4.0); let b = f64x4::new(5.0, 6.0, 7.0, 8.0); - let r = avx::_mm256_add_pd(a, b); + let r = unsafe { avx::_mm256_add_pd(a, b) }; let e = f64x4::new(6.0, 8.0, 10.0, 12.0); assert_eq!(r, e); } @@ -148,7 +147,7 @@ mod tests { fn _mm256_add_ps() { let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let r = avx::_mm256_add_ps(a, b); + let r = unsafe { avx::_mm256_add_ps(a, b) }; let e = f32x8::new(10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0); assert_eq!(r, e); } @@ -157,7 +156,7 @@ mod tests { fn _mm256_mul_pd() { let a = f64x4::new(1.0, 2.0, 3.0, 4.0); let b = f64x4::new(5.0, 6.0, 7.0, 8.0); - let r = avx::_mm256_mul_pd(a, b); 
+ let r = unsafe { avx::_mm256_mul_pd(a, b) }; let e = f64x4::new(5.0, 12.0, 21.0, 32.0); assert_eq!(r, e); } @@ -166,7 +165,7 @@ mod tests { fn _mm256_mul_ps() { let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); - let r = avx::_mm256_mul_ps(a, b); + let r = unsafe { avx::_mm256_mul_ps(a, b) }; let e = f32x8::new(9.0, 20.0, 33.0, 48.0, 65.0, 84.0, 105.0, 128.0); assert_eq!(r, e); } @@ -175,7 +174,7 @@ mod tests { fn _mm256_addsub_pd() { let a = f64x4::new(1.0, 2.0, 3.0, 4.0); let b = f64x4::new(5.0, 6.0, 7.0, 8.0); - let r = avx::_mm256_addsub_pd(a, b); + let r = unsafe { avx::_mm256_addsub_pd(a, b) }; let e = f64x4::new(-4.0, 8.0, -4.0, 12.0); assert_eq!(r, e); } @@ -184,7 +183,7 @@ mod tests { fn _mm256_addsub_ps() { let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0); let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0); - let r = avx::_mm256_addsub_ps(a, b); + let r = unsafe { avx::_mm256_addsub_ps(a, b) }; let e = f32x8::new(-4.0, 8.0, -4.0, 12.0, -4.0, 8.0, -4.0, 12.0); assert_eq!(r, e); } @@ -193,7 +192,7 @@ mod tests { fn _mm256_sub_pd() { let a = f64x4::new(1.0, 2.0, 3.0, 4.0); let b = f64x4::new(5.0, 6.0, 7.0, 8.0); - let r = avx::_mm256_sub_pd(a, b); + let r = unsafe { avx::_mm256_sub_pd(a, b) }; let e = f64x4::new(-4.0,-4.0,-4.0,-4.0); assert_eq!(r, e); } @@ -202,7 +201,7 @@ mod tests { fn _mm256_sub_ps() { let a = f32x8::new(1.0, 2.0, 3.0, 4.0, -1.0, -2.0, -3.0, -4.0); let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 3.0, 2.0, 1.0, 0.0); - let r = avx::_mm256_sub_ps(a, b); + let r = unsafe { avx::_mm256_sub_ps(a, b) }; let e = f32x8::new(-4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -4.0); assert_eq!(r, e); } @@ -210,9 +209,9 @@ mod tests { #[simd_test = "avx"] fn _mm256_round_pd() { let a = f64x4::new(1.55, 2.2, 3.99, -1.2); - let result_closest = avx::_mm256_round_pd(a, 0b00000000); - let result_down = avx::_mm256_round_pd(a, 0b00000001); - let result_up = 
avx::_mm256_round_pd(a, 0b00000010); + let result_closest = unsafe { avx::_mm256_round_pd(a, 0b00000000) }; + let result_down = unsafe { avx::_mm256_round_pd(a, 0b00000001) }; + let result_up = unsafe { avx::_mm256_round_pd(a, 0b00000010) }; let expected_closest = f64x4::new(2.0, 2.0, 4.0, -1.0); let expected_down = f64x4::new(1.0, 2.0, 3.0, -2.0); let expected_up = f64x4::new(2.0, 3.0, 4.0, -1.0); @@ -224,7 +223,7 @@ mod tests { #[simd_test = "avx"] fn _mm256_floor_pd() { let a = f64x4::new(1.55, 2.2, 3.99, -1.2); - let result_down = avx::_mm256_floor_pd(a); + let result_down = unsafe { avx::_mm256_floor_pd(a) }; let expected_down = f64x4::new(1.0, 2.0, 3.0, -2.0); assert_eq!(result_down, expected_down); } @@ -232,7 +231,7 @@ mod tests { #[simd_test = "avx"] fn _mm256_ceil_pd() { let a = f64x4::new(1.55, 2.2, 3.99, -1.2); - let result_up = avx::_mm256_ceil_pd(a, ); + let result_up = unsafe { avx::_mm256_ceil_pd(a) }; let expected_up = f64x4::new(2.0, 3.0, 4.0, -1.0); assert_eq!(result_up, expected_up); } diff --git a/library/stdarch/src/x86/avx2.rs b/library/stdarch/src/x86/avx2.rs index 5cb85c161467..6e3629f0e6ce 100644 --- a/library/stdarch/src/x86/avx2.rs +++ b/library/stdarch/src/x86/avx2.rs @@ -9,31 +9,31 @@ use stdsimd_test::assert_instr; #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpabsd))] -pub fn _mm256_abs_epi32(a: i32x8) -> i32x8 { - unsafe { pabsd(a) } +pub unsafe fn _mm256_abs_epi32(a: i32x8) -> i32x8 { + pabsd(a) } /// Computes the absolute values of packed 16-bit integers in `a`. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpabsw))] -pub fn _mm256_abs_epi16(a: i16x16) -> i16x16 { - unsafe { pabsw(a) } +pub unsafe fn _mm256_abs_epi16(a: i16x16) -> i16x16 { + pabsw(a) } /// Computes the absolute values of packed 8-bit integers in `a`. 
#[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpabsb))] -pub fn _mm256_abs_epi8(a: i8x32) -> i8x32 { - unsafe { pabsb(a) } +pub unsafe fn _mm256_abs_epi8(a: i8x32) -> i8x32 { + pabsb(a) } /// Add packed 64-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpaddq))] -pub fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 { +pub unsafe fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 { a + b } @@ -41,7 +41,7 @@ pub fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpaddd))] -pub fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 { +pub unsafe fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 { a + b } @@ -49,7 +49,7 @@ pub fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpaddw))] -pub fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 { +pub unsafe fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 { a + b } @@ -57,7 +57,7 @@ pub fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpaddb))] -pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 { +pub unsafe fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 { a + b } @@ -65,32 +65,32 @@ pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpaddsb))] -pub fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 { - unsafe { paddsb(a, b) } +pub unsafe fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 { + paddsb(a, b) } /// Add packed 16-bit integers in `a` and `b` using saturation. 
#[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpaddsw))] -pub fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 { - unsafe { paddsw(a, b) } +pub unsafe fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 { + paddsw(a, b) } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpaddusb))] -pub fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 { - unsafe { paddusb(a, b) } +pub unsafe fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 { + paddusb(a, b) } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpaddusw))] -pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { - unsafe { paddusw(a, b) } +pub unsafe fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { + paddusw(a, b) } // TODO _mm256_alignr_epi8 @@ -100,7 +100,7 @@ pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vandps))] -pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { +pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { a & b } @@ -109,7 +109,7 @@ pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vandnps))] -pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { +pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { (!a) & b } @@ -117,16 +117,16 @@ pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpavgw))] -pub fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 { - unsafe { pavgw(a, b) } +pub unsafe fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 { + pavgw(a, b) } /// Average packed unsigned 8-bit integers in `a` and `b`. 
#[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpavgb))] -pub fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 { - unsafe { pavgb(a, b) } +pub unsafe fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 { + pavgb(a, b) } // TODO _mm256_blend_epi16 @@ -137,8 +137,8 @@ pub fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpblendvb))] -pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 { - unsafe { pblendvb(a,b,mask) } +pub unsafe fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 { + pblendvb(a,b,mask) } // TODO _mm_broadcastb_epi8 @@ -158,12 +158,11 @@ pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 { // TODO _mm256_bslli_epi128 // TODO _mm256_bsrli_epi128 - /// Compare packed 64-bit integers in `a` and `b` for equality. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpcmpeqq))] -pub fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 { +pub unsafe fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 { a.eq(b) } @@ -171,7 +170,7 @@ pub fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpcmpeqd))] -pub fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 { +pub unsafe fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 { a.eq(b) } @@ -179,7 +178,7 @@ pub fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpcmpeqw))] -pub fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 { +pub unsafe fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 { a.eq(b) } @@ -187,7 +186,7 @@ pub fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpcmpeqb))] -pub fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 { +pub unsafe fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 { a.eq(b) } @@ 
-195,7 +194,7 @@ pub fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpcmpgtq))] -pub fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 { +pub unsafe fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 { a.gt(b) } @@ -203,7 +202,7 @@ pub fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpcmpgtd))] -pub fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 { +pub unsafe fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 { a.gt(b) } @@ -211,7 +210,7 @@ pub fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpcmpgtw))] -pub fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 { +pub unsafe fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 { a.gt(b) } @@ -219,7 +218,7 @@ pub fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpcmpgtb))] -pub fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 { +pub unsafe fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 { a.gt(b) } @@ -241,16 +240,16 @@ pub fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vphaddw))] -pub fn _mm256_hadd_epi16(a: i16x16, b: i16x16) -> i16x16 { - unsafe { phaddw(a, b) } +pub unsafe fn _mm256_hadd_epi16(a: i16x16, b: i16x16) -> i16x16 { + phaddw(a, b) } /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`. 
#[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vphaddd))] -pub fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 { - unsafe { phaddd(a, b) } +pub unsafe fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 { + phaddd(a, b) } /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b` @@ -258,24 +257,24 @@ pub fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vphaddsw))] -pub fn _mm256_hadds_epi16(a: i16x16, b: i16x16) -> i16x16 { - unsafe { phaddsw(a, b) } +pub unsafe fn _mm256_hadds_epi16(a: i16x16, b: i16x16) -> i16x16 { + phaddsw(a, b) } /// Horizontally substract adjacent pairs of 16-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vphsubw))] -pub fn _mm256_hsub_epi16(a: i16x16, b: i16x16) -> i16x16 { - unsafe { phsubw(a, b) } +pub unsafe fn _mm256_hsub_epi16(a: i16x16, b: i16x16) -> i16x16 { + phsubw(a, b) } /// Horizontally substract adjacent pairs of 32-bit integers in `a` and `b`. 
#[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vphsubd))] -pub fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 { - unsafe { phsubd(a, b) } +pub unsafe fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 { + phsubd(a, b) } /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` @@ -283,8 +282,8 @@ pub fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vphsubsw))] -pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 { - unsafe { phsubsw(a, b) } +pub unsafe fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 { + phsubsw(a, b) } @@ -328,8 +327,8 @@ pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmaddwd))] -pub fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 { - unsafe { pmaddwd(a, b) } +pub unsafe fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 { + pmaddwd(a, b) } /// Vertically multiply each unsigned 8-bit integer from `a` with the @@ -339,8 +338,8 @@ pub fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmaddubsw))] -pub fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 { - unsafe { pmaddubsw(a, b) } +pub unsafe fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 { + pmaddubsw(a, b) } // TODO _mm_maskload_epi32 (int const* mem_addr, __m128i mask) @@ -357,8 +356,8 @@ pub fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmaxsw))] -pub fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 { - unsafe { pmaxsw(a, b) } +pub unsafe fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 { + pmaxsw(a, b) } /// Compare packed 32-bit integers in `a` and `b`, and return the packed @@ -366,8 +365,8 @@ pub fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] 
#[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmaxsd))] -pub fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 { - unsafe { pmaxsd(a, b) } +pub unsafe fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 { + pmaxsd(a, b) } /// Compare packed 8-bit integers in `a` and `b`, and return the packed @@ -375,8 +374,8 @@ pub fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmaxsb))] -pub fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 { - unsafe { pmaxsb(a, b) } +pub unsafe fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 { + pmaxsb(a, b) } /// Compare packed unsigned 16-bit integers in `a` and `b`, and return @@ -384,8 +383,8 @@ pub fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmaxuw))] -pub fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 { - unsafe { pmaxuw(a, b) } +pub unsafe fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 { + pmaxuw(a, b) } /// Compare packed unsigned 32-bit integers in `a` and `b`, and return @@ -393,8 +392,8 @@ pub fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmaxud))] -pub fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 { - unsafe { pmaxud(a, b) } +pub unsafe fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 { + pmaxud(a, b) } /// Compare packed unsigned 8-bit integers in `a` and `b`, and return @@ -402,8 +401,8 @@ pub fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmaxub))] -pub fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 { - unsafe { pmaxub(a, b) } +pub unsafe fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 { + pmaxub(a, b) } /// Compare packed 16-bit integers in `a` and `b`, and return the packed @@ -411,8 +410,8 @@ pub fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 { #[inline(always)] #[target_feature = 
"+avx2"] #[cfg_attr(test, assert_instr(vpminsw))] -pub fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 { - unsafe { pminsw(a, b) } +pub unsafe fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 { + pminsw(a, b) } /// Compare packed 32-bit integers in `a` and `b`, and return the packed @@ -420,8 +419,8 @@ pub fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpminsd))] -pub fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 { - unsafe { pminsd(a, b) } +pub unsafe fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 { + pminsd(a, b) } /// Compare packed 8-bit integers in `a` and `b`, and return the packed @@ -429,8 +428,8 @@ pub fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpminsb))] -pub fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 { - unsafe { pminsb(a, b) } +pub unsafe fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 { + pminsb(a, b) } /// Compare packed unsigned 16-bit integers in `a` and `b`, and return @@ -438,8 +437,8 @@ pub fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpminuw))] -pub fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 { - unsafe { pminuw(a, b) } +pub unsafe fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 { + pminuw(a, b) } /// Compare packed unsigned 32-bit integers in `a` and `b`, and return @@ -447,8 +446,8 @@ pub fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpminud))] -pub fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 { - unsafe { pminud(a, b) } +pub unsafe fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 { + pminud(a, b) } /// Compare packed unsigned 8-bit integers in `a` and `b`, and return @@ -456,8 +455,8 @@ pub fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 { #[inline(always)] #[target_feature = 
"+avx2"] #[cfg_attr(test, assert_instr(vpminub))] -pub fn _mm256_min_epu8(a: u8x32, b: u8x32) -> u8x32 { - unsafe { pminub(a, b) } +pub unsafe fn _mm256_min_epu8(a: u8x32, b: u8x32) -> u8x32 { + pminub(a, b) } /*** The following two functions fail in debug, but work in release @@ -492,8 +491,8 @@ pub fn _mm256_mpsadbw_epu8(a: u8x32, b: u8x32, imm8: i32) -> u16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmuldq))] -pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 { - unsafe { pmuldq(a, b) } +pub unsafe fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 { + pmuldq(a, b) } /// Multiply the low unsigned 32-bit integers from each packed 64-bit @@ -503,8 +502,8 @@ pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmuludq))] -pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 { - unsafe { pmuludq(a, b) } +pub unsafe fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 { + pmuludq(a, b) } /// Multiply the packed 16-bit integers in `a` and `b`, producing @@ -513,8 +512,8 @@ pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmulhw))] -pub fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 { - unsafe { pmulhw(a, b) } +pub unsafe fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 { + pmulhw(a, b) } /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing @@ -523,8 +522,8 @@ pub fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmulhuw))] -pub fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 { - unsafe { pmulhuw(a, b) } +pub unsafe fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 { + pmulhuw(a, b) } /// Multiply the packed 16-bit integers in `a` and `b`, producing @@ -533,7 +532,7 @@ pub fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 { #[inline(always)] 
#[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmullw))] -pub fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 { +pub unsafe fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 { a * b } @@ -544,7 +543,7 @@ pub fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmulld))] -pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 { +pub unsafe fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 { a * b } @@ -555,8 +554,8 @@ pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpmulhrsw))] -pub fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 { - unsafe { pmulhrsw(a, b) } +pub unsafe fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 { + pmulhrsw(a, b) } /// Compute the bitwise OR of 256 bits (representing integer data) in `a` @@ -564,7 +563,7 @@ pub fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vorps))] -pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { +pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { a | b } @@ -573,8 +572,8 @@ pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpacksswb))] -pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 { - unsafe { packsswb(a, b) } +pub unsafe fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 { + packsswb(a, b) } /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers @@ -582,8 +581,8 @@ pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpackssdw))] -pub fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 { - unsafe { packssdw(a, b) } +pub unsafe fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 { + packssdw(a, b) } /// Convert packed 
16-bit integers from `a` and `b` to packed 8-bit integers @@ -591,8 +590,8 @@ pub fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpackuswb))] -pub fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 { - unsafe { packuswb(a, b) } +pub unsafe fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 { + packuswb(a, b) } /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers @@ -600,8 +599,8 @@ pub fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpackusdw))] -pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 { - unsafe { packusdw(a, b) } +pub unsafe fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 { + packusdw(a, b) } // TODO _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8) @@ -617,8 +616,8 @@ pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsadbw))] -pub fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 { - unsafe { psadbw(a, b) } +pub unsafe fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 { + psadbw(a, b) } // TODO _mm256_shuffle_epi32 (__m256i a, const int imm8) @@ -632,8 +631,8 @@ pub fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsignw))] -pub fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 { - unsafe { psignw(a, b) } +pub unsafe fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 { + psignw(a, b) } /// Negate packed 32-bit integers in `a` when the corresponding signed @@ -642,8 +641,8 @@ pub fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsignd))] -pub fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 { - unsafe { psignd(a, b) } +pub unsafe fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 { + 
psignd(a, b) } /// Negate packed 8-bit integers in `a` when the corresponding signed @@ -652,8 +651,8 @@ pub fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsignb))] -pub fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 { - unsafe { psignb(a, b) } +pub unsafe fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 { + psignb(a, b) } /// Shift packed 16-bit integers in `a` left by `count` while @@ -661,8 +660,8 @@ pub fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsllw))] -pub fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 { - unsafe { psllw(a, count) } +pub unsafe fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 { + psllw(a, count) } /// Shift packed 32-bit integers in `a` left by `count` while @@ -670,8 +669,8 @@ pub fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpslld))] -pub fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 { - unsafe { pslld(a, count) } +pub unsafe fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 { + pslld(a, count) } /// Shift packed 64-bit integers in `a` left by `count` while @@ -679,35 +678,35 @@ pub fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsllq))] -pub fn _mm256_sll_epi64(a: i64x4, count: i64x2) -> i64x4 { - unsafe { psllq(a, count) } +pub unsafe fn _mm256_sll_epi64(a: i64x4, count: i64x2) -> i64x4 { + psllq(a, count) } /// Shift packed 16-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; #[inline(always)] #[target_feature = "+avx2"] -#[cfg_attr(test, assert_instr(vpsllw))] // TODO: should this be pslli -pub fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 { - unsafe { pslliw(a, imm8) } +#[cfg_attr(test, assert_instr(vpsllw))] +pub unsafe fn 
_mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 { + pslliw(a, imm8) } /// Shift packed 32-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; #[inline(always)] #[target_feature = "+avx2"] -#[cfg_attr(test, assert_instr(vpslld))] // TODO: should this be pslli -pub fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 { - unsafe { psllid(a, imm8) } +#[cfg_attr(test, assert_instr(vpslld))] +pub unsafe fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 { + psllid(a, imm8) } /// Shift packed 64-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; #[inline(always)] #[target_feature = "+avx2"] -#[cfg_attr(test, assert_instr(vpsllq))] // TODO: should this be pslli -pub fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 { - unsafe { pslliq(a, imm8) } +#[cfg_attr(test, assert_instr(vpsllq))] +pub unsafe fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 { + pslliq(a, imm8) } // TODO _mm256_slli_si256 (__m256i a, const int imm8) @@ -718,8 +717,8 @@ pub fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsllvd))] -pub fn _mm_sllv_epi32(a: i32x4, count: i32x4) -> i32x4 { - unsafe { psllvd(a, count) } +pub unsafe fn _mm_sllv_epi32(a: i32x4, count: i32x4) -> i32x4 { + psllvd(a, count) } /// Shift packed 32-bit integers in `a` left by the amount @@ -728,8 +727,8 @@ pub fn _mm_sllv_epi32(a: i32x4, count: i32x4) -> i32x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsllvd))] -pub fn _mm256_sllv_epi32(a: i32x8, count: i32x8) -> i32x8 { - unsafe { psllvd256(a, count) } +pub unsafe fn _mm256_sllv_epi32(a: i32x8, count: i32x8) -> i32x8 { + psllvd256(a, count) } /// Shift packed 64-bit integers in `a` left by the amount @@ -738,8 +737,8 @@ pub fn _mm256_sllv_epi32(a: i32x8, count: i32x8) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsllvq))] -pub fn _mm_sllv_epi64(a: i64x2, 
count: i64x2) -> i64x2 { - unsafe { psllvq(a, count) } +pub unsafe fn _mm_sllv_epi64(a: i64x2, count: i64x2) -> i64x2 { + psllvq(a, count) } /// Shift packed 64-bit integers in `a` left by the amount @@ -748,8 +747,8 @@ pub fn _mm_sllv_epi64(a: i64x2, count: i64x2) -> i64x2 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsllvq))] -pub fn _mm256_sllv_epi64(a: i64x4, count: i64x4) -> i64x4 { - unsafe { psllvq256(a, count) } +pub unsafe fn _mm256_sllv_epi64(a: i64x4, count: i64x4) -> i64x4 { + psllvq256(a, count) } /// Shift packed 16-bit integers in `a` right by `count` while @@ -757,8 +756,8 @@ pub fn _mm256_sllv_epi64(a: i64x4, count: i64x4) -> i64x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsraw))] -pub fn _mm256_sra_epi16(a: i16x16, count: i16x8) -> i16x16 { - unsafe { psraw(a, count) } +pub unsafe fn _mm256_sra_epi16(a: i16x16, count: i16x8) -> i16x16 { + psraw(a, count) } /// Shift packed 32-bit integers in `a` right by `count` while @@ -766,26 +765,26 @@ pub fn _mm256_sra_epi16(a: i16x16, count: i16x8) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsrad))] -pub fn _mm256_sra_epi32(a: i32x8, count: i32x4) -> i32x8 { - unsafe { psrad(a, count) } +pub unsafe fn _mm256_sra_epi32(a: i32x8, count: i32x4) -> i32x8 { + psrad(a, count) } /// Shift packed 16-bit integers in `a` right by `imm8` while /// shifting in sign bits. #[inline(always)] #[target_feature = "+avx2"] -#[cfg_attr(test, assert_instr(vpsraw))] // TODO: notvpsraiw? -pub fn _mm256_srai_epi16(a: i16x16, imm8: i32) -> i16x16 { - unsafe { psraiw(a, imm8) } +#[cfg_attr(test, assert_instr(vpsraw))] +pub unsafe fn _mm256_srai_epi16(a: i16x16, imm8: i32) -> i16x16 { + psraiw(a, imm8) } /// Shift packed 32-bit integers in `a` right by `imm8` while /// shifting in sign bits. #[inline(always)] #[target_feature = "+avx2"] -#[cfg_attr(test, assert_instr(vpsrad))] // TODO: not vpsraid? 
-pub fn _mm256_srai_epi32(a: i32x8, imm8: i32) -> i32x8 { - unsafe { psraid(a, imm8) } +#[cfg_attr(test, assert_instr(vpsrad))] +pub unsafe fn _mm256_srai_epi32(a: i32x8, imm8: i32) -> i32x8 { + psraid(a, imm8) } /// Shift packed 32-bit integers in `a` right by the amount specified by the @@ -793,8 +792,8 @@ pub fn _mm256_srai_epi32(a: i32x8, imm8: i32) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsravd))] -pub fn _mm_srav_epi32(a: i32x4, count: i32x4) -> i32x4 { - unsafe { psravd(a, count) } +pub unsafe fn _mm_srav_epi32(a: i32x4, count: i32x4) -> i32x4 { + psravd(a, count) } /// Shift packed 32-bit integers in `a` right by the amount specified by the @@ -802,8 +801,8 @@ pub fn _mm_srav_epi32(a: i32x4, count: i32x4) -> i32x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsravd))] -pub fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 { - unsafe { psravd256(a, count) } +pub unsafe fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 { + psravd256(a, count) } @@ -812,8 +811,8 @@ pub fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsrlw))] -pub fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 { - unsafe { psrlw(a, count) } +pub unsafe fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 { + psrlw(a, count) } /// Shift packed 32-bit integers in `a` right by `count` while shifting in @@ -821,8 +820,8 @@ pub fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsrld))] -pub fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 { - unsafe { psrld(a, count) } +pub unsafe fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 { + psrld(a, count) } /// Shift packed 64-bit integers in `a` right by `count` while shifting in @@ -830,35 +829,35 @@ pub fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 { #[inline(always)] 
#[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsrlq))] -pub fn _mm256_srl_epi64(a: i64x4, count: i64x2) -> i64x4 { - unsafe { psrlq(a, count) } +pub unsafe fn _mm256_srl_epi64(a: i64x4, count: i64x2) -> i64x4 { + psrlq(a, count) } /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in /// zeros #[inline(always)] #[target_feature = "+avx2"] -#[cfg_attr(test, assert_instr(vpsrlw))] // TODO not vpsrliw? -pub fn _mm256_srli_epi16(a: i16x16, imm8: i32) -> i16x16 { - unsafe { psrliw(a, imm8) } +#[cfg_attr(test, assert_instr(vpsrlw))] +pub unsafe fn _mm256_srli_epi16(a: i16x16, imm8: i32) -> i16x16 { + psrliw(a, imm8) } /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in /// zeros #[inline(always)] #[target_feature = "+avx2"] -#[cfg_attr(test, assert_instr(vpsrld))] // TODO: not vpsrlid? -pub fn _mm256_srli_epi32(a: i32x8, imm8: i32) -> i32x8 { - unsafe { psrlid(a, imm8) } +#[cfg_attr(test, assert_instr(vpsrld))] +pub unsafe fn _mm256_srli_epi32(a: i32x8, imm8: i32) -> i32x8 { + psrlid(a, imm8) } /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in /// zeros #[inline(always)] #[target_feature = "+avx2"] -#[cfg_attr(test, assert_instr(vpsrlq))] // TODO: not vpsrliq? 
-pub fn _mm256_srli_epi64(a: i64x4, imm8: i32) -> i64x4 { - unsafe { psrliq(a, imm8) } +#[cfg_attr(test, assert_instr(vpsrlq))] +pub unsafe fn _mm256_srli_epi64(a: i64x4, imm8: i32) -> i64x4 { + psrliq(a, imm8) } /// Shift packed 32-bit integers in `a` right by the amount specified by @@ -866,8 +865,8 @@ pub fn _mm256_srli_epi64(a: i64x4, imm8: i32) -> i64x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsrlvd))] -pub fn _mm_srlv_epi32(a: i32x4, count: i32x4) -> i32x4 { - unsafe { psrlvd(a, count) } +pub unsafe fn _mm_srlv_epi32(a: i32x4, count: i32x4) -> i32x4 { + psrlvd(a, count) } /// Shift packed 32-bit integers in `a` right by the amount specified by @@ -875,8 +874,8 @@ pub fn _mm_srlv_epi32(a: i32x4, count: i32x4) -> i32x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsrlvd))] -pub fn _mm256_srlv_epi32(a: i32x8, count: i32x8) -> i32x8 { - unsafe { psrlvd256(a, count) } +pub unsafe fn _mm256_srlv_epi32(a: i32x8, count: i32x8) -> i32x8 { + psrlvd256(a, count) } /// Shift packed 64-bit integers in `a` right by the amount specified by @@ -884,8 +883,8 @@ pub fn _mm256_srlv_epi32(a: i32x8, count: i32x8) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsrlvq))] -pub fn _mm_srlv_epi64(a: i64x2, count: i64x2) -> i64x2 { - unsafe { psrlvq(a, count) } +pub unsafe fn _mm_srlv_epi64(a: i64x2, count: i64x2) -> i64x2 { + psrlvq(a, count) } /// Shift packed 64-bit integers in `a` right by the amount specified by @@ -893,8 +892,8 @@ pub fn _mm_srlv_epi64(a: i64x2, count: i64x2) -> i64x2 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsrlvq))] -pub fn _mm256_srlv_epi64(a: i64x4, count: i64x4) -> i64x4 { - unsafe { psrlvq256(a, count) } +pub unsafe fn _mm256_srlv_epi64(a: i64x4, count: i64x4) -> i64x4 { + psrlvq256(a, count) } // TODO _mm256_stream_load_si256 (__m256i const* mem_addr) @@ -903,7 +902,7 @@ pub fn _mm256_srlv_epi64(a: i64x4, 
count: i64x4) -> i64x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsubw))] -pub fn _mm256_sub_epi16(a: i16x16, b: i16x16) -> i16x16 { +pub unsafe fn _mm256_sub_epi16(a: i16x16, b: i16x16) -> i16x16 { a - b } @@ -911,7 +910,7 @@ pub fn _mm256_sub_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsubd))] -pub fn _mm256_sub_epi32(a: i32x8, b: i32x8) -> i32x8 { +pub unsafe fn _mm256_sub_epi32(a: i32x8, b: i32x8) -> i32x8 { a - b } @@ -919,7 +918,7 @@ pub fn _mm256_sub_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsubq))] -pub fn _mm256_sub_epi64(a: i64x4, b: i64x4) -> i64x4 { +pub unsafe fn _mm256_sub_epi64(a: i64x4, b: i64x4) -> i64x4 { a - b } @@ -927,7 +926,7 @@ pub fn _mm256_sub_epi64(a: i64x4, b: i64x4) -> i64x4 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsubb))] -pub fn _mm256_sub_epi8(a: i8x32, b: i8x32) -> i8x32 { +pub unsafe fn _mm256_sub_epi8(a: i8x32, b: i8x32) -> i8x32 { a - b } @@ -936,8 +935,8 @@ pub fn _mm256_sub_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsubsw))] -pub fn _mm256_subs_epi16(a: i16x16, b: i16x16) -> i16x16 { - unsafe { psubsw(a, b) } +pub unsafe fn _mm256_subs_epi16(a: i16x16, b: i16x16) -> i16x16 { + psubsw(a, b) } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in @@ -945,8 +944,8 @@ pub fn _mm256_subs_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsubsb))] -pub fn _mm256_subs_epi8(a: i8x32, b: i8x32) -> i8x32 { - unsafe { psubsb(a, b) } +pub unsafe fn _mm256_subs_epi8(a: i8x32, b: i8x32) -> i8x32 { + psubsb(a, b) } /// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit @@ -954,8 +953,8 @@ pub fn _mm256_subs_epi8(a: i8x32, b: i8x32) -> i8x32 { 
#[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsubusw))] -pub fn _mm256_subs_epu16(a: u16x16, b: u16x16) -> u16x16 { - unsafe { psubusw(a, b) } +pub unsafe fn _mm256_subs_epu16(a: u16x16, b: u16x16) -> u16x16 { + psubusw(a, b) } /// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit @@ -963,8 +962,8 @@ pub fn _mm256_subs_epu16(a: u16x16, b: u16x16) -> u16x16 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpsubusb))] -pub fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 { - unsafe { psubusb(a, b) } +pub unsafe fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 { + psubusb(a, b) } // TODO __mm256_unpackhi_epi16 (__m256i a, __m256i b) @@ -981,11 +980,10 @@ pub fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 { #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vxorps))] -pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { +pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { a ^ b } - #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx2.pabs.b"] @@ -1048,9 +1046,9 @@ extern "C" { fn pminud(a: u32x8, b: u32x8) -> u32x8; #[link_name = "llvm.x86.avx2.pminu.b"] fn pminub(a: u8x32, b: u8x32) -> u8x32; - #[link_name = "llvm.x86.avx2.pmovmskb"] //fails in debug + #[link_name = "llvm.x86.avx2.pmovmskb"] fn pmovmskb(a: i8x32) -> i32; - #[link_name = "llvm.x86.avx2.mpsadbw"] //fails in debug + #[link_name = "llvm.x86.avx2.mpsadbw"] fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16; #[link_name = "llvm.x86.avx2.pmulhu.w"] fn pmulhuw(a: u16x16, b: u16x16) -> u16x16; @@ -1141,7 +1139,6 @@ extern "C" { } - #[cfg(test)] mod tests { use stdsimd_test::simd_test; @@ -1157,7 +1154,7 @@ mod tests { let a = i32x8::new( 0, 1, -1, std::i32::MAX, std::i32::MIN + 1, 100, -100, -32); - let r = avx2::_mm256_abs_epi32(a); + let r = unsafe { avx2::_mm256_abs_epi32(a) }; let e = i32x8::new( 0, 1, 1, std::i32::MAX, (std::i32::MIN + 1).abs(), 100, 100, 32); @@ 
-1171,7 +1168,7 @@ mod tests { -2, 3, -3, 4, -4, 5, -5, std::i16::MAX, std::i16::MIN + 1, 100, -100, -32); - let r = avx2::_mm256_abs_epi16(a); + let r = unsafe { avx2::_mm256_abs_epi16(a) }; let e = i16x16::new( 0, 1, 1, 2, 2, 3, 3, 4, @@ -1191,7 +1188,7 @@ mod tests { -2, 3, -3, 4, -4, 5, -5, std::i8::MAX, std::i8::MIN + 1, 100, -100, -32); - let r = avx2::_mm256_abs_epi8(a); + let r = unsafe { avx2::_mm256_abs_epi8(a) }; let e = i8x32::new( 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32, @@ -1204,7 +1201,7 @@ mod tests { fn _mm256_add_epi64() { let a = i64x4::new(-10, 0, 100, 1_000_000_000); let b = i64x4::new(-1, 0, 1, 2); - let r = avx2::_mm256_add_epi64(a, b); + let r = unsafe { avx2::_mm256_add_epi64(a, b) }; let e = i64x4::new(-11, 0, 101, 1_000_000_002); assert_eq!(r, e); } @@ -1213,7 +1210,7 @@ mod tests { fn _mm256_add_epi32() { let a = i32x8::new(-1, 0, 1, 2, 3, 4, 5, 6); let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let r = avx2::_mm256_add_epi32(a, b); + let r = unsafe { avx2::_mm256_add_epi32(a, b) }; let e = i32x8::new(0, 2, 4, 6, 8, 10, 12, 14); assert_eq!(r, e); } @@ -1226,7 +1223,7 @@ mod tests { let b = i16x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = avx2::_mm256_add_epi16(a, b); + let r = unsafe { avx2::_mm256_add_epi16(a, b) }; let e = i16x16::new( 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); @@ -1245,7 +1242,7 @@ mod tests { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = avx2::_mm256_add_epi8(a, b); + let r = unsafe { avx2::_mm256_add_epi8(a, b) }; let e = i8x32::new( 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, @@ -1262,7 +1259,7 @@ mod tests { let b = i8x32::new( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - let r = avx2::_mm256_adds_epi8(a, b); + let r = unsafe { avx2::_mm256_adds_epi8(a, b) }; let 
e = i8x32::new( 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94); @@ -1273,7 +1270,7 @@ mod tests { fn _mm256_adds_epi8_saturate_positive() { let a = i8x32::splat(0x7F); let b = i8x32::splat(1); - let r = avx2::_mm256_adds_epi8(a, b); + let r = unsafe { avx2::_mm256_adds_epi8(a, b) }; assert_eq!(r, a); } @@ -1281,7 +1278,7 @@ mod tests { fn _mm256_adds_epi8_saturate_negative() { let a = i8x32::splat(-0x80); let b = i8x32::splat(-1); - let r = avx2::_mm256_adds_epi8(a, b); + let r = unsafe { avx2::_mm256_adds_epi8(a, b) }; assert_eq!(r, a); } @@ -1291,7 +1288,7 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = i16x16::new( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47); - let r = avx2::_mm256_adds_epi16(a, b); + let r = unsafe { avx2::_mm256_adds_epi16(a, b) }; let e = i16x16::new( 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62); @@ -1302,7 +1299,7 @@ mod tests { fn _mm256_adds_epi16_saturate_positive() { let a = i16x16::splat(0x7FFF); let b = i16x16::splat(1); - let r = avx2::_mm256_adds_epi16(a, b); + let r = unsafe { avx2::_mm256_adds_epi16(a, b) }; assert_eq!(r, a); } @@ -1310,7 +1307,7 @@ mod tests { fn _mm256_adds_epi16_saturate_negative() { let a = i16x16::splat(-0x8000); let b = i16x16::splat(-1); - let r = avx2::_mm256_adds_epi16(a, b); + let r = unsafe { avx2::_mm256_adds_epi16(a, b) }; assert_eq!(r, a); } @@ -1322,7 +1319,7 @@ mod tests { let b = u8x32::new( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); - let r = avx2::_mm256_adds_epu8(a, b); + let r = unsafe { avx2::_mm256_adds_epu8(a, b) }; let e = u8x32::new( 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94); @@ -1333,7 +1330,7 @@ mod tests { fn _mm256_adds_epu8_saturate() { let a = u8x32::splat(0xFF); let 
b = u8x32::splat(1); - let r = avx2::_mm256_adds_epu8(a, b); + let r = unsafe { avx2::_mm256_adds_epu8(a, b) }; assert_eq!(r, a); } @@ -1344,7 +1341,7 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = u16x16::new( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47); - let r = avx2::_mm256_adds_epu16(a, b); + let r = unsafe { avx2::_mm256_adds_epu16(a, b) }; let e = u16x16::new( 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62); @@ -1355,35 +1352,37 @@ mod tests { fn _mm256_adds_epu16_saturate() { let a = u16x16::splat(0xFFFF); let b = u16x16::splat(1); - let r = avx2::_mm256_adds_epu16(a, b); + let r = unsafe { avx2::_mm256_adds_epu16(a, b) }; assert_eq!(r, a); } #[simd_test = "avx2"] fn _mm256_and_si256() { - assert_eq!( - avx2::_mm256_and_si256( - __m256i::splat(5), __m256i::splat(3)),__m256i::splat(1)); + let got = unsafe { + avx2::_mm256_and_si256(__m256i::splat(5), __m256i::splat(3)) + }; + assert_eq!(got, __m256i::splat(1)); } #[simd_test = "avx2"] fn _mm256_andnot_si256() { - assert_eq!( - avx2::_mm256_andnot_si256(__m256i::splat(5), __m256i::splat(3)), - __m256i::splat(2)); + let got = unsafe { + avx2::_mm256_andnot_si256(__m256i::splat(5), __m256i::splat(3)) + }; + assert_eq!(got, __m256i::splat(2)); } #[simd_test = "avx2"] fn _mm256_avg_epu8() { let (a, b) = (u8x32::splat(3), u8x32::splat(9)); - let r = avx2::_mm256_avg_epu8(a, b); + let r = unsafe { avx2::_mm256_avg_epu8(a, b) }; assert_eq!(r, u8x32::splat(6)); } #[simd_test = "avx2"] fn _mm256_avg_epu16() { let (a, b) = (u16x16::splat(3), u16x16::splat(9)); - let r = avx2::_mm256_avg_epu16(a, b); + let r = unsafe { avx2::_mm256_avg_epu16(a, b) }; assert_eq!(r, u16x16::splat(6)); } @@ -1392,7 +1391,7 @@ mod tests { let (a,b) = (i8x32::splat(4),i8x32::splat(2)); let mask = i8x32::splat(0).replace(2,-1); let e = i8x32::splat(4).replace(2,2); - let r= avx2::_mm256_blendv_epi8(a,b,mask); + let r= unsafe { avx2::_mm256_blendv_epi8(a,b,mask) }; 
assert_eq!(r,e); } @@ -1404,7 +1403,7 @@ mod tests { let b = i8x32::new( 31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = avx2::_mm256_cmpeq_epi8(a, b); + let r = unsafe { avx2::_mm256_cmpeq_epi8(a, b) }; assert_eq!(r, i8x32::splat(0).replace(2,0xFFu8 as i8)); } @@ -1414,7 +1413,7 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = i16x16::new( 15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = avx2::_mm256_cmpeq_epi16(a, b); + let r = unsafe { avx2::_mm256_cmpeq_epi16(a, b) }; assert_eq!(r, i16x16::splat(0).replace(2, 0xFFFFu16 as i16)); } @@ -1422,7 +1421,7 @@ mod tests { fn _mm256_cmpeq_epi32() { let a = i32x8::new(0, 1, 2, 3,4,5,6,7); let b = i32x8::new(7,6,2,4,3, 2, 1, 0); - let r = avx2::_mm256_cmpeq_epi32(a, b); + let r = unsafe { avx2::_mm256_cmpeq_epi32(a, b) }; assert_eq!(r, i32x8::splat(0).replace(2, 0xFFFFFFFFu32 as i32)); } @@ -1430,7 +1429,7 @@ mod tests { fn _mm256_cmpeq_epi64() { let a = i64x4::new(0, 1, 2, 3); let b = i64x4::new(3, 2, 2, 0); - let r = avx2::_mm256_cmpeq_epi64(a, b); + let r = unsafe { avx2::_mm256_cmpeq_epi64(a, b) }; assert_eq!(r, i64x4::splat(0).replace( 2, 0xFFFFFFFFFFFFFFFFu64 as i64)); } @@ -1439,7 +1438,7 @@ mod tests { fn _mm256_cmpgt_epi8() { let a = i8x32::splat(0).replace(0, 5); let b = i8x32::splat(0); - let r = avx2::_mm256_cmpgt_epi8(a, b); + let r = unsafe { avx2::_mm256_cmpgt_epi8(a, b) }; assert_eq!(r, i8x32::splat(0).replace(0, 0xFFu8 as i8)); } @@ -1447,7 +1446,7 @@ mod tests { fn _mm256_cmpgt_epi16() { let a = i16x16::splat(0).replace(0, 5); let b = i16x16::splat(0); - let r = avx2::_mm256_cmpgt_epi16(a, b); + let r = unsafe { avx2::_mm256_cmpgt_epi16(a, b) }; assert_eq!(r, i16x16::splat(0).replace(0, 0xFFFFu16 as i16)); } @@ -1455,7 +1454,7 @@ mod tests { fn _mm256_cmpgt_epi32() { let a = i32x8::splat(0).replace(0, 5); let b = i32x8::splat(0); - let r = avx2::_mm256_cmpgt_epi32(a, b); + let r = 
unsafe { avx2::_mm256_cmpgt_epi32(a, b) }; assert_eq!(r, i32x8::splat(0).replace(0, 0xFFFFFFFFu32 as i32)); } @@ -1463,7 +1462,7 @@ mod tests { fn _mm256_cmpgt_epi64() { let a = i64x4::splat(0).replace(0, 5); let b = i64x4::splat(0); - let r = avx2::_mm256_cmpgt_epi64(a, b); + let r = unsafe { avx2::_mm256_cmpgt_epi64(a, b) }; assert_eq!(r, i64x4::splat(0).replace( 0, 0xFFFFFFFFFFFFFFFFu64 as i64)); } @@ -1472,7 +1471,7 @@ mod tests { fn _mm256_hadd_epi16() { let a = i16x16::splat(2); let b = i16x16::splat(4); - let r = avx2::_mm256_hadd_epi16(a, b); + let r = unsafe { avx2::_mm256_hadd_epi16(a, b) }; let e = i16x16::new(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); assert_eq!(r, e); } @@ -1481,7 +1480,7 @@ mod tests { fn _mm256_hadd_epi32() { let a = i32x8::splat(2); let b = i32x8::splat(4); - let r = avx2::_mm256_hadd_epi32(a, b); + let r = unsafe { avx2::_mm256_hadd_epi32(a, b) }; let e = i32x8::new(4, 4, 8, 8, 4, 4, 8, 8); assert_eq!(r, e); } @@ -1490,7 +1489,7 @@ mod tests { fn _mm256_hadds_epi16() { let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,1); let b = i16x16::splat(4); - let r = avx2::_mm256_hadds_epi16(a, b); + let r = unsafe { avx2::_mm256_hadds_epi16(a, b) }; let e = i16x16::new( 0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); assert_eq!(r, e); @@ -1500,7 +1499,7 @@ mod tests { fn _mm256_hsub_epi16() { let a = i16x16::splat(2); let b = i16x16::splat(4); - let r = avx2::_mm256_hsub_epi16(a, b); + let r = unsafe { avx2::_mm256_hsub_epi16(a, b) }; let e = i16x16::splat(0); assert_eq!(r, e); } @@ -1509,7 +1508,7 @@ mod tests { fn _mm256_hsub_epi32() { let a = i32x8::splat(2); let b = i32x8::splat(4); - let r = avx2::_mm256_hsub_epi32(a, b); + let r = unsafe { avx2::_mm256_hsub_epi32(a, b) }; let e = i32x8::splat(0); assert_eq!(r, e); } @@ -1518,7 +1517,7 @@ mod tests { fn _mm256_hsubs_epi16() { let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,-1); let b = i16x16::splat(4); - let r = avx2::_mm256_hsubs_epi16(a, b); + let r = unsafe { 
avx2::_mm256_hsubs_epi16(a, b) }; let e = i16x16::splat(0).replace(0,0x7FFF); assert_eq!(r, e); } @@ -1527,7 +1526,7 @@ mod tests { fn _mm256_madd_epi16() { let a = i16x16::splat(2); let b = i16x16::splat(4); - let r = avx2::_mm256_madd_epi16(a, b); + let r = unsafe { avx2::_mm256_madd_epi16(a, b) }; let e = i32x8::splat(16); assert_eq!(r, e); } @@ -1536,7 +1535,7 @@ mod tests { fn _mm256_maddubs_epi16() { let a = u8x32::splat(2); let b = u8x32::splat(4); - let r = avx2::_mm256_maddubs_epi16(a, b); + let r = unsafe { avx2::_mm256_maddubs_epi16(a, b) }; let e = i16x16::splat(16); assert_eq!(r, e); } @@ -1545,7 +1544,7 @@ mod tests { fn _mm256_max_epi16() { let a = i16x16::splat(2); let b = i16x16::splat(4); - let r = avx2::_mm256_max_epi16(a, b); + let r = unsafe { avx2::_mm256_max_epi16(a, b) }; assert_eq!(r, b); } @@ -1553,7 +1552,7 @@ mod tests { fn _mm256_max_epi32() { let a = i32x8::splat(2); let b = i32x8::splat(4); - let r = avx2::_mm256_max_epi32(a, b); + let r = unsafe { avx2::_mm256_max_epi32(a, b) }; assert_eq!(r, b); } @@ -1561,7 +1560,7 @@ mod tests { fn _mm256_max_epi8() { let a = i8x32::splat(2); let b = i8x32::splat(4); - let r = avx2::_mm256_max_epi8(a, b); + let r = unsafe { avx2::_mm256_max_epi8(a, b) }; assert_eq!(r, b); } @@ -1569,7 +1568,7 @@ mod tests { fn _mm256_max_epu16() { let a = u16x16::splat(2); let b = u16x16::splat(4); - let r = avx2::_mm256_max_epu16(a, b); + let r = unsafe { avx2::_mm256_max_epu16(a, b) }; assert_eq!(r, b); } @@ -1577,7 +1576,7 @@ mod tests { fn _mm256_max_epu32() { let a = u32x8::splat(2); let b = u32x8::splat(4); - let r = avx2::_mm256_max_epu32(a, b); + let r = unsafe { avx2::_mm256_max_epu32(a, b) }; assert_eq!(r, b); } @@ -1585,7 +1584,7 @@ mod tests { fn _mm256_max_epu8() { let a = u8x32::splat(2); let b = u8x32::splat(4); - let r = avx2::_mm256_max_epu8(a, b); + let r = unsafe { avx2::_mm256_max_epu8(a, b) }; assert_eq!(r, b); } @@ -1593,7 +1592,7 @@ mod tests { fn _mm256_min_epi16() { let a = 
i16x16::splat(2); let b = i16x16::splat(4); - let r = avx2::_mm256_min_epi16(a, b); + let r = unsafe { avx2::_mm256_min_epi16(a, b) }; assert_eq!(r, a); } @@ -1601,7 +1600,7 @@ mod tests { fn _mm256_min_epi32() { let a = i32x8::splat(2); let b = i32x8::splat(4); - let r = avx2::_mm256_min_epi32(a, b); + let r = unsafe { avx2::_mm256_min_epi32(a, b) }; assert_eq!(r, a); } @@ -1609,7 +1608,7 @@ mod tests { fn _mm256_min_epi8() { let a = i8x32::splat(2); let b = i8x32::splat(4); - let r = avx2::_mm256_min_epi8(a, b); + let r = unsafe { avx2::_mm256_min_epi8(a, b) }; assert_eq!(r, a); } @@ -1617,7 +1616,7 @@ mod tests { fn _mm256_min_epu16() { let a = u16x16::splat(2); let b = u16x16::splat(4); - let r = avx2::_mm256_min_epu16(a, b); + let r = unsafe { avx2::_mm256_min_epu16(a, b) }; assert_eq!(r, a); } @@ -1625,7 +1624,7 @@ mod tests { fn _mm256_min_epu32() { let a = u32x8::splat(2); let b = u32x8::splat(4); - let r = avx2::_mm256_min_epu32(a, b); + let r = unsafe { avx2::_mm256_min_epu32(a, b) }; assert_eq!(r, a); } @@ -1633,7 +1632,7 @@ mod tests { fn _mm256_min_epu8() { let a = u8x32::splat(2); let b = u8x32::splat(4); - let r = avx2::_mm256_min_epu8(a, b); + let r = unsafe { avx2::_mm256_min_epu8(a, b) }; assert_eq!(r, a); } @@ -1665,7 +1664,7 @@ mod tests { fn _mm256_mul_epi32() { let a = i32x8::new(0, 0, 0, 0, 2, 2, 2, 2); let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let r = avx2::_mm256_mul_epi32(a, b); + let r = unsafe { avx2::_mm256_mul_epi32(a, b) }; let e = i64x4::new(0, 0, 10, 14); assert_eq!(r, e); } @@ -1674,7 +1673,7 @@ mod tests { fn _mm256_mul_epu32() { let a = u32x8::new(0, 0, 0, 0, 2, 2, 2, 2); let b = u32x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let r = avx2::_mm256_mul_epu32(a, b); + let r = unsafe { avx2::_mm256_mul_epu32(a, b) }; let e = u64x4::new(0, 0, 10, 14); assert_eq!(r, e); } @@ -1683,7 +1682,7 @@ mod tests { fn _mm256_mulhi_epi16() { let a = i16x16::splat(6535); let b = i16x16::splat(6535); - let r = avx2::_mm256_mulhi_epi16(a, b); + let r = 
unsafe { avx2::_mm256_mulhi_epi16(a, b) }; let e = i16x16::splat(651); assert_eq!(r, e); } @@ -1692,7 +1691,7 @@ mod tests { fn _mm256_mulhi_epu16() { let a = u16x16::splat(6535); let b = u16x16::splat(6535); - let r = avx2::_mm256_mulhi_epu16(a, b); + let r = unsafe { avx2::_mm256_mulhi_epu16(a, b) }; let e = u16x16::splat(651); assert_eq!(r, e); } @@ -1701,7 +1700,7 @@ mod tests { fn _mm256_mullo_epi16() { let a = i16x16::splat(2); let b = i16x16::splat(4); - let r = avx2::_mm256_mullo_epi16(a, b); + let r = unsafe { avx2::_mm256_mullo_epi16(a, b) }; let e = i16x16::splat(8); assert_eq!(r, e); } @@ -1710,7 +1709,7 @@ mod tests { fn _mm256_mullo_epi32() { let a = i32x8::splat(2); let b = i32x8::splat(4); - let r = avx2::_mm256_mullo_epi32(a, b); + let r = unsafe { avx2::_mm256_mullo_epi32(a, b) }; let e = i32x8::splat(8); assert_eq!(r, e); } @@ -1719,7 +1718,7 @@ mod tests { fn _mm256_mulhrs_epi16() { let a = i16x16::splat(2); let b = i16x16::splat(4); - let r = avx2::_mm256_mullo_epi16(a, b); + let r = unsafe { avx2::_mm256_mullo_epi16(a, b) }; let e = i16x16::splat(8); assert_eq!(r, e); } @@ -1728,7 +1727,7 @@ mod tests { fn _mm256_or_si256() { let a = __m256i::splat(-1); let b = __m256i::splat(0); - let r = avx2::_mm256_or_si256(a, b); + let r = unsafe { avx2::_mm256_or_si256(a, b) }; assert_eq!(r, a); } @@ -1736,7 +1735,7 @@ mod tests { fn _mm256_packs_epi16() { let a = i16x16::splat(2); let b = i16x16::splat(4); - let r = avx2::_mm256_packs_epi16(a, b); + let r = unsafe { avx2::_mm256_packs_epi16(a, b) }; let e = i8x32::new( 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, @@ -1750,7 +1749,7 @@ mod tests { fn _mm256_packs_epi32() { let a = i32x8::splat(2); let b = i32x8::splat(4); - let r = avx2::_mm256_packs_epi32(a, b); + let r = unsafe { avx2::_mm256_packs_epi32(a, b) }; let e = i16x16::new( 2, 2, 2, 2, 4, 4, 4, 4, @@ -1764,7 +1763,7 @@ mod tests { fn _mm256_packus_epi16() { let a = i16x16::splat(2); let b = i16x16::splat(4); - let r = 
avx2::_mm256_packus_epi16(a, b); + let r = unsafe { avx2::_mm256_packus_epi16(a, b) }; let e = u8x32::new( 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, @@ -1778,7 +1777,7 @@ mod tests { fn _mm256_packus_epi32() { let a = i32x8::splat(2); let b = i32x8::splat(4); - let r = avx2::_mm256_packus_epi32(a, b); + let r = unsafe { avx2::_mm256_packus_epi32(a, b) }; let e = u16x16::new( 2, 2, 2, 2, 4, 4, 4, 4, @@ -1792,7 +1791,7 @@ mod tests { fn _mm256_sad_epu8() { let a = u8x32::splat(2); let b = u8x32::splat(4); - let r = avx2::_mm256_sad_epu8(a, b); + let r = unsafe { avx2::_mm256_sad_epu8(a, b) }; let e = u64x4::splat(16); assert_eq!(r, e); } @@ -1801,7 +1800,7 @@ mod tests { fn _mm256_sign_epi16() { let a = i16x16::splat(2); let b = i16x16::splat(-1); - let r = avx2::_mm256_sign_epi16(a, b); + let r = unsafe { avx2::_mm256_sign_epi16(a, b) }; let e = i16x16::splat(-2); assert_eq!(r, e); } @@ -1810,7 +1809,7 @@ mod tests { fn _mm256_sign_epi32() { let a = i32x8::splat(2); let b = i32x8::splat(-1); - let r = avx2::_mm256_sign_epi32(a, b); + let r = unsafe { avx2::_mm256_sign_epi32(a, b) }; let e = i32x8::splat(-2); assert_eq!(r, e); } @@ -1819,53 +1818,53 @@ mod tests { fn _mm256_sign_epi8() { let a = i8x32::splat(2); let b = i8x32::splat(-1); - let r = avx2::_mm256_sign_epi8(a, b); + let r = unsafe { avx2::_mm256_sign_epi8(a, b) }; let e = i8x32::splat(-2); assert_eq!(r, e); } #[simd_test = "avx2"] fn _mm256_sll_epi16() { - assert_eq!( - avx2::_mm256_sll_epi16(i16x16::splat(0xFF), i16x8::splat(0).replace(0,4)), - i16x16::splat(0xFF0)); - + let a = i16x16::splat(0xFF); + let b = i16x8::splat(0).replace(0, 4); + let r = unsafe { avx2::_mm256_sll_epi16(a, b) }; + assert_eq!(r, i16x16::splat(0xFF0)); } #[simd_test = "avx2"] fn _mm256_sll_epi32() { - assert_eq!( - avx2::_mm256_sll_epi32(i32x8::splat(0xFFFF), i32x4::splat(0).replace(0,4)), - i32x8::splat(0xFFFF0)); - + let a = i32x8::splat(0xFFFF); + let b = i32x4::splat(0).replace(0, 4); + let r = unsafe { 
avx2::_mm256_sll_epi32(a, b) }; + assert_eq!(r, i32x8::splat(0xFFFF0)); } #[simd_test = "avx2"] fn _mm256_sll_epi64() { - assert_eq!( - avx2::_mm256_sll_epi64(i64x4::splat(0xFFFFFFFF), i64x2::splat(0).replace(0,4)), - i64x4::splat(0xFFFFFFFF0)); - + let a = i64x4::splat(0xFFFFFFFF); + let b = i64x2::splat(0).replace(0, 4); + let r = unsafe { avx2::_mm256_sll_epi64(a, b) }; + assert_eq!(r, i64x4::splat(0xFFFFFFFF0)); } #[simd_test = "avx2"] fn _mm256_slli_epi16() { assert_eq!( - avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4), + unsafe { avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4) }, i16x16::splat(0xFF0)); } #[simd_test = "avx2"] fn _mm256_slli_epi32() { assert_eq!( - avx2::_mm256_slli_epi32(i32x8::splat(0xFFFF), 4), + unsafe { avx2::_mm256_slli_epi32(i32x8::splat(0xFFFF), 4) }, i32x8::splat(0xFFFF0)); } #[simd_test = "avx2"] fn _mm256_slli_epi64() { assert_eq!( - avx2::_mm256_slli_epi64(i64x4::splat(0xFFFFFFFF), 4), + unsafe { avx2::_mm256_slli_epi64(i64x4::splat(0xFFFFFFFF), 4) }, i64x4::splat(0xFFFFFFFF0)); } @@ -1873,7 +1872,7 @@ mod tests { fn _mm_sllv_epi32() { let a = i32x4::splat(2); let b = i32x4::splat(1); - let r = avx2::_mm_sllv_epi32(a, b); + let r = unsafe { avx2::_mm_sllv_epi32(a, b) }; let e = i32x4::splat(4); assert_eq!(r, e); } @@ -1882,7 +1881,7 @@ mod tests { fn _mm256_sllv_epi32() { let a = i32x8::splat(2); let b = i32x8::splat(1); - let r = avx2::_mm256_sllv_epi32(a, b); + let r = unsafe { avx2::_mm256_sllv_epi32(a, b) }; let e = i32x8::splat(4); assert_eq!(r, e); } @@ -1891,7 +1890,7 @@ mod tests { fn _mm_sllv_epi64() { let a = i64x2::splat(2); let b = i64x2::splat(1); - let r = avx2::_mm_sllv_epi64(a, b); + let r = unsafe { avx2::_mm_sllv_epi64(a, b) }; let e = i64x2::splat(4); assert_eq!(r, e); } @@ -1900,46 +1899,46 @@ mod tests { fn _mm256_sllv_epi64() { let a = i64x4::splat(2); let b = i64x4::splat(1); - let r = avx2::_mm256_sllv_epi64(a, b); + let r = unsafe { avx2::_mm256_sllv_epi64(a, b) }; let e = i64x4::splat(4); assert_eq!(r, 
e); } #[simd_test = "avx2"] fn _mm256_sra_epi16() { - assert_eq!( - avx2::_mm256_sra_epi16( - i16x16::splat(-1), i16x8::new(1, 0, 0, 0, 0, 0, 0, 0)), - i16x16::splat(-1)); + let a = i16x16::splat(-1); + let b = i16x8::new(1, 0, 0, 0, 0, 0, 0, 0); + let r = unsafe { avx2::_mm256_sra_epi16(a, b) }; + assert_eq!(r, i16x16::splat(-1)); } #[simd_test = "avx2"] fn _mm256_sra_epi32() { - assert_eq!( - avx2::_mm256_sra_epi32( - i32x8::splat(-1), i32x4::splat(0).replace(0,1)), - i32x8::splat(-1)); + let a = i32x8::splat(-1); + let b = i32x4::splat(0).replace(0, 1); + let r = unsafe { avx2::_mm256_sra_epi32(a, b) }; + assert_eq!(r, i32x8::splat(-1)); } #[simd_test = "avx2"] fn _mm256_srai_epi16() { - assert_eq!( - avx2::_mm256_srai_epi16( - i16x16::splat(-1), 1), i16x16::splat(-1)); + assert_eq!( + unsafe { avx2::_mm256_srai_epi16(i16x16::splat(-1), 1) }, + i16x16::splat(-1)); } #[simd_test = "avx2"] fn _mm256_srai_epi32() { - assert_eq!( - avx2::_mm256_srai_epi32( - i32x8::splat(-1), 1), i32x8::splat(-1)); + assert_eq!( + unsafe { avx2::_mm256_srai_epi32(i32x8::splat(-1), 1) }, + i32x8::splat(-1)); } #[simd_test = "avx2"] fn _mm_srav_epi32() { let a = i32x4::splat(4); let count = i32x4::splat(1); - let r = avx2::_mm_srav_epi32(a, count); + let r = unsafe { avx2::_mm_srav_epi32(a, count) }; let e = i32x4::splat(2); assert_eq!(r, e ); } @@ -1948,53 +1947,53 @@ mod tests { fn _mm256_srav_epi32() { let a = i32x8::splat(4); let count = i32x8::splat(1); - let r = avx2::_mm256_srav_epi32(a, count); + let r = unsafe { avx2::_mm256_srav_epi32(a, count) }; let e = i32x8::splat(2); assert_eq!(r, e ); } #[simd_test = "avx2"] fn _mm256_srl_epi16() { - assert_eq!( - avx2::_mm256_srl_epi16( - i16x16::splat(0xFF), i16x8::splat(0).replace(0,4)), - i16x16::splat(0xF)); + let a = i16x16::splat(0xFF); + let b = i16x8::splat(0).replace(0, 4); + let r = unsafe { avx2::_mm256_srl_epi16(a, b) }; + assert_eq!(r, i16x16::splat(0xF)); } #[simd_test = "avx2"] fn _mm256_srl_epi32() { - assert_eq!( - 
avx2::_mm256_srl_epi32( - i32x8::splat(0xFFFF), i32x4::splat(0).replace(0,4)), - i32x8::splat(0xFFF)); + let a = i32x8::splat(0xFFFF); + let b = i32x4::splat(0).replace(0, 4); + let r = unsafe { avx2::_mm256_srl_epi32(a, b) }; + assert_eq!(r, i32x8::splat(0xFFF)); } #[simd_test = "avx2"] fn _mm256_srl_epi64() { - assert_eq!( - avx2::_mm256_srl_epi64( - i64x4::splat(0xFFFFFFFF), i64x2::splat(0).replace(0,4)), - i64x4::splat(0xFFFFFFF)); + let a = i64x4::splat(0xFFFFFFFF); + let b = i64x2::splat(0).replace(0, 4); + let r = unsafe { avx2::_mm256_srl_epi64(a, b) }; + assert_eq!(r, i64x4::splat(0xFFFFFFF)); } #[simd_test = "avx2"] fn _mm256_srli_epi16() { assert_eq!( - avx2::_mm256_srli_epi16(i16x16::splat(0xFF), 4), + unsafe { avx2::_mm256_srli_epi16(i16x16::splat(0xFF), 4) }, i16x16::splat(0xF)); } #[simd_test = "avx2"] fn _mm256_srli_epi32() { assert_eq!( - avx2::_mm256_srli_epi32(i32x8::splat(0xFFFF), 4), + unsafe { avx2::_mm256_srli_epi32(i32x8::splat(0xFFFF), 4) }, i32x8::splat(0xFFF)); } #[simd_test = "avx2"] fn _mm256_srli_epi64() { assert_eq!( - avx2::_mm256_srli_epi64(i64x4::splat(0xFFFFFFFF), 4), + unsafe { avx2::_mm256_srli_epi64(i64x4::splat(0xFFFFFFFF), 4) }, i64x4::splat(0xFFFFFFF)); } @@ -2002,7 +2001,7 @@ mod tests { fn _mm_srlv_epi32() { let a = i32x4::splat(2); let count = i32x4::splat(1); - let r = avx2::_mm_srlv_epi32(a, count); + let r = unsafe { avx2::_mm_srlv_epi32(a, count) }; let e = i32x4::splat(1); assert_eq!(r, e); } @@ -2011,7 +2010,7 @@ mod tests { fn _mm256_srlv_epi32() { let a = i32x8::splat(2); let count = i32x8::splat(1); - let r = avx2::_mm256_srlv_epi32(a, count); + let r = unsafe { avx2::_mm256_srlv_epi32(a, count) }; let e = i32x8::splat(1); assert_eq!(r, e); } @@ -2020,7 +2019,7 @@ mod tests { fn _mm_srlv_epi64() { let a = i64x2::splat(2); let count = i64x2::splat(1); - let r = avx2::_mm_srlv_epi64(a, count); + let r = unsafe { avx2::_mm_srlv_epi64(a, count) }; let e = i64x2::splat(1); assert_eq!(r, e); } @@ -2030,7 +2029,7 @@ mod 
tests { fn _mm256_srlv_epi64() { let a = i64x4::splat(2); let count = i64x4::splat(1); - let r = avx2::_mm256_srlv_epi64(a, count); + let r = unsafe { avx2::_mm256_srlv_epi64(a, count) }; let e = i64x4::splat(1); assert_eq!(r, e); } @@ -2039,7 +2038,7 @@ mod tests { fn _mm256_sub_epi16() { let a = i16x16::splat(4); let b = i16x16::splat(2); - let r = avx2::_mm256_sub_epi16(a, b); + let r = unsafe { avx2::_mm256_sub_epi16(a, b) }; assert_eq!(r, b); } @@ -2047,7 +2046,7 @@ mod tests { fn _mm256_sub_epi32() { let a = i32x8::splat(4); let b = i32x8::splat(2); - let r = avx2::_mm256_sub_epi32(a, b); + let r = unsafe { avx2::_mm256_sub_epi32(a, b) }; assert_eq!(r, b); } @@ -2055,7 +2054,7 @@ mod tests { fn _mm256_sub_epi64() { let a = i64x4::splat(4); let b = i64x4::splat(2); - let r = avx2::_mm256_sub_epi64(a, b); + let r = unsafe { avx2::_mm256_sub_epi64(a, b) }; assert_eq!(r, b); } @@ -2063,7 +2062,7 @@ mod tests { fn _mm256_sub_epi8() { let a = i8x32::splat(4); let b = i8x32::splat(2); - let r = avx2::_mm256_sub_epi8(a, b); + let r = unsafe { avx2::_mm256_sub_epi8(a, b) }; assert_eq!(r, b); } @@ -2071,7 +2070,7 @@ mod tests { fn _mm256_subs_epi16() { let a = i16x16::splat(4); let b = i16x16::splat(2); - let r = avx2::_mm256_subs_epi16(a, b); + let r = unsafe { avx2::_mm256_subs_epi16(a, b) }; assert_eq!(r, b); } @@ -2079,7 +2078,7 @@ mod tests { fn _mm256_subs_epi8() { let a = i8x32::splat(4); let b = i8x32::splat(2); - let r = avx2::_mm256_subs_epi8(a, b); + let r = unsafe { avx2::_mm256_subs_epi8(a, b) }; assert_eq!(r, b); } @@ -2087,7 +2086,7 @@ mod tests { fn _mm256_subs_epu16() { let a = u16x16::splat(4); let b = u16x16::splat(2); - let r = avx2::_mm256_subs_epu16(a, b); + let r = unsafe { avx2::_mm256_subs_epu16(a, b) }; assert_eq!(r, b); } @@ -2095,14 +2094,15 @@ mod tests { fn _mm256_subs_epu8() { let a = u8x32::splat(4); let b = u8x32::splat(2); - let r = avx2::_mm256_subs_epu8(a, b); + let r = unsafe { avx2::_mm256_subs_epu8(a, b) }; assert_eq!(r, b); } 
#[simd_test = "avx2"] fn _mm256_xor_si256() { - assert_eq!( - avx2::_mm256_xor_si256(__m256i::splat(5), __m256i::splat(3)), - __m256i::splat(6)); + let a = __m256i::splat(5); + let b = __m256i::splat(3); + let r = unsafe { avx2::_mm256_xor_si256(a, b) }; + assert_eq!(r, __m256i::splat(6)); } } diff --git a/library/stdarch/src/x86/bmi.rs b/library/stdarch/src/x86/bmi.rs index 5bf3e8974704..a422410f918a 100644 --- a/library/stdarch/src/x86/bmi.rs +++ b/library/stdarch/src/x86/bmi.rs @@ -10,20 +10,12 @@ #[cfg(test)] use stdsimd_test::assert_instr; -#[allow(dead_code)] -extern "C" { - #[link_name="llvm.x86.bmi.bextr.32"] - fn x86_bmi_bextr_32(x: u32, y: u32) -> u32; - #[link_name="llvm.x86.bmi.bextr.64"] - fn x86_bmi_bextr_64(x: u64, y: u64) -> u64; -} - /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. #[inline(always)] #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(bextr))] -pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { +pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { _bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32)) } @@ -33,7 +25,7 @@ pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(bextr))] #[cfg(not(target_arch = "x86"))] -pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 { +pub unsafe fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 { _bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64)) } @@ -45,8 +37,8 @@ pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 { #[inline(always)] #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(bextr))] -pub fn _bextr2_u32(a: u32, control: u32) -> u32 { - unsafe { x86_bmi_bextr_32(a, control) } +pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 { + x86_bmi_bextr_32(a, control) } /// Extracts bits of `a` specified by `control` into @@ -58,15 +50,15 @@ pub fn _bextr2_u32(a: u32, control: u32) -> u32 { #[target_feature = 
"+bmi"] #[cfg_attr(test, assert_instr(bextr))] #[cfg(not(target_arch = "x86"))] -pub fn _bextr2_u64(a: u64, control: u64) -> u64 { - unsafe { x86_bmi_bextr_64(a, control) } +pub unsafe fn _bextr2_u64(a: u64, control: u64) -> u64 { + x86_bmi_bextr_64(a, control) } /// Bitwise logical `AND` of inverted `a` with `b`. #[inline(always)] #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(andn))] -pub fn _andn_u32(a: u32, b: u32) -> u32 { +pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 { !a & b } @@ -74,7 +66,7 @@ pub fn _andn_u32(a: u32, b: u32) -> u32 { #[inline(always)] #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(andn))] -pub fn _andn_u64(a: u64, b: u64) -> u64 { +pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 { !a & b } @@ -82,7 +74,7 @@ pub fn _andn_u64(a: u64, b: u64) -> u64 { #[inline(always)] #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(blsi))] -pub fn _blsi_u32(x: u32) -> u32 { +pub unsafe fn _blsi_u32(x: u32) -> u32 { x & x.wrapping_neg() } @@ -91,7 +83,7 @@ pub fn _blsi_u32(x: u32) -> u32 { #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(blsi))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions -pub fn _blsi_u64(x: u64) -> u64 { +pub unsafe fn _blsi_u64(x: u64) -> u64 { x & x.wrapping_neg() } @@ -99,7 +91,7 @@ pub fn _blsi_u64(x: u64) -> u64 { #[inline(always)] #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(blsmsk))] -pub fn _blsmsk_u32(x: u32) -> u32 { +pub unsafe fn _blsmsk_u32(x: u32) -> u32 { x ^ (x.wrapping_sub(1u32)) } @@ -108,7 +100,7 @@ pub fn _blsmsk_u32(x: u32) -> u32 { #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(blsmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions -pub fn _blsmsk_u64(x: u64) -> u64 { +pub unsafe fn _blsmsk_u64(x: u64) -> u64 { x ^ (x.wrapping_sub(1u64)) } @@ -118,7 +110,7 @@ pub fn _blsmsk_u64(x: u64) -> u64 { #[inline(always)] #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(blsr))] -pub fn _blsr_u32(x: u32) -> u32 { +pub 
unsafe fn _blsr_u32(x: u32) -> u32 { x & (x.wrapping_sub(1)) } @@ -129,7 +121,7 @@ pub fn _blsr_u32(x: u32) -> u32 { #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(blsr))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions -pub fn _blsr_u64(x: u64) -> u64 { +pub unsafe fn _blsr_u64(x: u64) -> u64 { x & (x.wrapping_sub(1)) } @@ -139,7 +131,7 @@ pub fn _blsr_u64(x: u64) -> u64 { #[inline(always)] #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(tzcnt))] -pub fn _tzcnt_u16(x: u16) -> u16 { +pub unsafe fn _tzcnt_u16(x: u16) -> u16 { x.trailing_zeros() as u16 } @@ -149,7 +141,7 @@ pub fn _tzcnt_u16(x: u16) -> u16 { #[inline(always)] #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(tzcnt))] -pub fn _tzcnt_u32(x: u32) -> u32 { +pub unsafe fn _tzcnt_u32(x: u32) -> u32 { x.trailing_zeros() } @@ -159,7 +151,7 @@ pub fn _tzcnt_u32(x: u32) -> u32 { #[inline(always)] #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(tzcnt))] -pub fn _tzcnt_u64(x: u64) -> u64 { +pub unsafe fn _tzcnt_u64(x: u64) -> u64 { x.trailing_zeros() as u64 } @@ -169,7 +161,7 @@ pub fn _tzcnt_u64(x: u64) -> u64 { #[inline(always)] #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(tzcnt))] -pub fn _mm_tzcnt_u32(x: u32) -> u32 { +pub unsafe fn _mm_tzcnt_u32(x: u32) -> u32 { x.trailing_zeros() } @@ -179,10 +171,18 @@ pub fn _mm_tzcnt_u32(x: u32) -> u32 { #[inline(always)] #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(tzcnt))] -pub fn _mm_tzcnt_u64(x: u64) -> u64 { +pub unsafe fn _mm_tzcnt_u64(x: u64) -> u64 { x.trailing_zeros() as u64 } +#[allow(dead_code)] +extern "C" { + #[link_name="llvm.x86.bmi.bextr.32"] + fn x86_bmi_bextr_32(x: u32, y: u32) -> u32; + #[link_name="llvm.x86.bmi.bextr.64"] + fn x86_bmi_bextr_64(x: u64, y: u64) -> u64; +} + #[cfg(test)] mod tests { use stdsimd_test::simd_test; @@ -191,98 +191,122 @@ mod tests { #[simd_test = "bmi"] fn _bextr_u32() { - assert_eq!(bmi::_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32); + let r 
= unsafe { bmi::_bextr_u32(0b0101_0000u32, 4, 4) }; + assert_eq!(r, 0b0000_0101u32); } #[simd_test = "bmi"] #[cfg(not(target_arch = "x86"))] fn _bextr_u64() { - assert_eq!(bmi::_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64); + let r = unsafe { bmi::_bextr_u64(0b0101_0000u64, 4, 4) }; + assert_eq!(r, 0b0000_0101u64); } #[simd_test = "bmi"] fn _andn_u32() { - assert_eq!(bmi::_andn_u32(0, 0), 0); - assert_eq!(bmi::_andn_u32(0, 1), 1); - assert_eq!(bmi::_andn_u32(1, 0), 0); - assert_eq!(bmi::_andn_u32(1, 1), 0); + assert_eq!(unsafe { bmi::_andn_u32(0, 0) }, 0); + assert_eq!(unsafe { bmi::_andn_u32(0, 1) }, 1); + assert_eq!(unsafe { bmi::_andn_u32(1, 0) }, 0); + assert_eq!(unsafe { bmi::_andn_u32(1, 1) }, 0); - assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32), 0b0000_0000u32); - assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32), 0b1111_1111u32); - assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32), 0b0000_0000u32); - assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32), 0b0000_0000u32); - assert_eq!(bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32), 0b0001_1101u32); + let r = unsafe { bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32) }; + assert_eq!(r, 0b0000_0000u32); + + let r = unsafe { bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32) }; + assert_eq!(r, 0b1111_1111u32); + + let r = unsafe { bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32) }; + assert_eq!(r, 0b0000_0000u32); + + let r = unsafe { bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32) }; + assert_eq!(r, 0b0000_0000u32); + + let r = unsafe { bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32) }; + assert_eq!(r, 0b0001_1101u32); } #[simd_test = "bmi"] #[cfg(not(target_arch = "x86"))] fn _andn_u64() { - assert_eq!(bmi::_andn_u64(0, 0), 0); - assert_eq!(bmi::_andn_u64(0, 1), 1); - assert_eq!(bmi::_andn_u64(1, 0), 0); - assert_eq!(bmi::_andn_u64(1, 1), 0); + assert_eq!(unsafe { bmi::_andn_u64(0, 0) }, 0); + assert_eq!(unsafe { bmi::_andn_u64(0, 1) }, 1); + assert_eq!(unsafe { bmi::_andn_u64(1, 0) }, 
0); + assert_eq!(unsafe { bmi::_andn_u64(1, 1) }, 0); - assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b0000_0000u64), 0b0000_0000u64); - assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b1111_1111u64), 0b1111_1111u64); - assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b0000_0000u64), 0b0000_0000u64); - assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b1111_1111u64), 0b0000_0000u64); - assert_eq!(bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64), 0b0001_1101u64); + let r = unsafe { bmi::_andn_u64(0b0000_0000u64, 0b0000_0000u64) }; + assert_eq!(r, 0b0000_0000u64); + + let r = unsafe { bmi::_andn_u64(0b0000_0000u64, 0b1111_1111u64) }; + assert_eq!(r, 0b1111_1111u64); + + let r = unsafe { bmi::_andn_u64(0b1111_1111u64, 0b0000_0000u64) }; + assert_eq!(r, 0b0000_0000u64); + + let r = unsafe { bmi::_andn_u64(0b1111_1111u64, 0b1111_1111u64) }; + assert_eq!(r, 0b0000_0000u64); + + let r = unsafe { bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64) }; + assert_eq!(r, 0b0001_1101u64); } #[simd_test = "bmi"] fn _blsi_u32() { - assert_eq!(bmi::_blsi_u32(0b1101_0000u32), 0b0001_0000u32); + assert_eq!(unsafe { bmi::_blsi_u32(0b1101_0000u32) }, 0b0001_0000u32); } #[simd_test = "bmi"] #[cfg(not(target_arch = "x86"))] fn _blsi_u64() { - assert_eq!(bmi::_blsi_u64(0b1101_0000u64), 0b0001_0000u64); + assert_eq!(unsafe { bmi::_blsi_u64(0b1101_0000u64) }, 0b0001_0000u64); } #[simd_test = "bmi"] fn _blsmsk_u32() { - assert_eq!(bmi::_blsmsk_u32(0b0011_0000u32), 0b0001_1111u32); + let r = unsafe { bmi::_blsmsk_u32(0b0011_0000u32) }; + assert_eq!(r, 0b0001_1111u32); } #[simd_test = "bmi"] #[cfg(not(target_arch = "x86"))] fn _blsmsk_u64() { - assert_eq!(bmi::_blsmsk_u64(0b0011_0000u64), 0b0001_1111u64); + let r = unsafe { bmi::_blsmsk_u64(0b0011_0000u64) }; + assert_eq!(r, 0b0001_1111u64); } #[simd_test = "bmi"] fn _blsr_u32() { - /// TODO: test the behavior when the input is 0 - assert_eq!(bmi::_blsr_u32(0b0011_0000u32), 0b0010_0000u32); + // TODO: test the behavior when the input is 0 + let r = unsafe { 
bmi::_blsr_u32(0b0011_0000u32) }; + assert_eq!(r, 0b0010_0000u32); } #[simd_test = "bmi"] #[cfg(not(target_arch = "x86"))] fn _blsr_u64() { - /// TODO: test the behavior when the input is 0 - assert_eq!(bmi::_blsr_u64(0b0011_0000u64), 0b0010_0000u64); + // TODO: test the behavior when the input is 0 + let r = unsafe { bmi::_blsr_u64(0b0011_0000u64) }; + assert_eq!(r, 0b0010_0000u64); } #[simd_test = "bmi"] fn _tzcnt_u16() { - assert_eq!(bmi::_tzcnt_u16(0b0000_0001u16), 0u16); - assert_eq!(bmi::_tzcnt_u16(0b0000_0000u16), 16u16); - assert_eq!(bmi::_tzcnt_u16(0b1001_0000u16), 4u16); + assert_eq!(unsafe { bmi::_tzcnt_u16(0b0000_0001u16) }, 0u16); + assert_eq!(unsafe { bmi::_tzcnt_u16(0b0000_0000u16) }, 16u16); + assert_eq!(unsafe { bmi::_tzcnt_u16(0b1001_0000u16) }, 4u16); } #[simd_test = "bmi"] fn _tzcnt_u32() { - assert_eq!(bmi::_tzcnt_u32(0b0000_0001u32), 0u32); - assert_eq!(bmi::_tzcnt_u32(0b0000_0000u32), 32u32); - assert_eq!(bmi::_tzcnt_u32(0b1001_0000u32), 4u32); + assert_eq!(unsafe { bmi::_tzcnt_u32(0b0000_0001u32) }, 0u32); + assert_eq!(unsafe { bmi::_tzcnt_u32(0b0000_0000u32) }, 32u32); + assert_eq!(unsafe { bmi::_tzcnt_u32(0b1001_0000u32) }, 4u32); } #[simd_test = "bmi"] #[cfg(not(target_arch = "x86"))] fn _tzcnt_u64() { - assert_eq!(bmi::_tzcnt_u64(0b0000_0001u64), 0u64); - assert_eq!(bmi::_tzcnt_u64(0b0000_0000u64), 64u64); - assert_eq!(bmi::_tzcnt_u64(0b1001_0000u64), 4u64); + assert_eq!(unsafe { bmi::_tzcnt_u64(0b0000_0001u64) }, 0u64); + assert_eq!(unsafe { bmi::_tzcnt_u64(0b0000_0000u64) }, 64u64); + assert_eq!(unsafe { bmi::_tzcnt_u64(0b1001_0000u64) }, 4u64); } } diff --git a/library/stdarch/src/x86/bmi2.rs b/library/stdarch/src/x86/bmi2.rs index 09afe87e469a..7779fac68f50 100644 --- a/library/stdarch/src/x86/bmi2.rs +++ b/library/stdarch/src/x86/bmi2.rs @@ -19,7 +19,7 @@ use stdsimd_test::assert_instr; #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))] #[cfg_attr(all(test, target_arch = "x86"), assert_instr(mulx))] #[target_feature 
= "+bmi2"] -pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) { +pub unsafe fn _mulx_u32(a: u32, b: u32) -> (u32, u32) { let result: u64 = (a as u64) * (b as u64); let hi = (result >> 32) as u32; (result as u32, hi) @@ -33,12 +33,67 @@ pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) { #[cfg_attr(test, assert_instr(mulx))] #[target_feature = "+bmi2"] #[cfg(not(target_arch = "x86"))] // calls an intrinsic -pub fn _mulx_u64(a: u64, b: u64) -> (u64, u64) { +pub unsafe fn _mulx_u64(a: u64, b: u64) -> (u64, u64) { let result: u128 = (a as u128) * (b as u128); let hi = (result >> 64) as u64; (result as u64, hi) } +/// Zero higher bits of `a` >= `index`. +#[inline(always)] +#[target_feature = "+bmi2"] +#[cfg_attr(test, assert_instr(bzhi))] +pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 { + x86_bmi2_bzhi_32(a, index) +} + +/// Zero higher bits of `a` >= `index`. +#[inline(always)] +#[target_feature = "+bmi2"] +#[cfg_attr(test, assert_instr(bzhi))] +#[cfg(not(target_arch = "x86"))] +pub unsafe fn _bzhi_u64(a: u64, index: u64) -> u64 { + x86_bmi2_bzhi_64(a, index) +} + +/// Scatter contiguous low order bits of `a` to the result at the positions +/// specified by the `mask`. +#[inline(always)] +#[target_feature = "+bmi2"] +#[cfg_attr(test, assert_instr(pdep))] +pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 { + x86_bmi2_pdep_32(a, mask) +} + +/// Scatter contiguous low order bits of `a` to the result at the positions +/// specified by the `mask`. +#[inline(always)] +#[target_feature = "+bmi2"] +#[cfg_attr(test, assert_instr(pdep))] +#[cfg(not(target_arch = "x86"))] +pub unsafe fn _pdep_u64(a: u64, mask: u64) -> u64 { + x86_bmi2_pdep_64(a, mask) +} + +/// Gathers the bits of `x` specified by the `mask` into the contiguous low +/// order bit positions of the result. 
+#[inline(always)] +#[target_feature = "+bmi2"] +#[cfg_attr(test, assert_instr(pext))] +pub unsafe fn _pext_u32(a: u32, mask: u32) -> u32 { + x86_bmi2_pext_32(a, mask) +} + +/// Gathers the bits of `x` specified by the `mask` into the contiguous low +/// order bit positions of the result. +#[inline(always)] +#[target_feature = "+bmi2"] +#[cfg_attr(test, assert_instr(pext))] +#[cfg(not(target_arch = "x86"))] +pub unsafe fn _pext_u64(a: u64, mask: u64) -> u64 { + x86_bmi2_pext_64(a, mask) +} + #[allow(dead_code)] extern "C" { #[link_name="llvm.x86.bmi.bzhi.32"] @@ -55,63 +110,6 @@ extern "C" { fn x86_bmi2_pext_64(x: u64, y: u64) -> u64; } - -/// Zero higher bits of `a` >= `index`. -#[inline(always)] -#[target_feature = "+bmi2"] -#[cfg_attr(test, assert_instr(bzhi))] -pub fn _bzhi_u32(a: u32, index: u32) -> u32 { - unsafe { x86_bmi2_bzhi_32(a, index) } -} - -/// Zero higher bits of `a` >= `index`. -#[inline(always)] -#[target_feature = "+bmi2"] -#[cfg_attr(test, assert_instr(bzhi))] -#[cfg(not(target_arch = "x86"))] -pub fn _bzhi_u64(a: u64, index: u64) -> u64 { - unsafe { x86_bmi2_bzhi_64(a, index) } -} - - -/// Scatter contiguous low order bits of `a` to the result at the positions -/// specified by the `mask`. -#[inline(always)] -#[target_feature = "+bmi2"] -#[cfg_attr(test, assert_instr(pdep))] -pub fn _pdep_u32(a: u32, mask: u32) -> u32 { - unsafe { x86_bmi2_pdep_32(a, mask) } -} - -/// Scatter contiguous low order bits of `a` to the result at the positions -/// specified by the `mask`. -#[inline(always)] -#[target_feature = "+bmi2"] -#[cfg_attr(test, assert_instr(pdep))] -#[cfg(not(target_arch = "x86"))] -pub fn _pdep_u64(a: u64, mask: u64) -> u64 { - unsafe { x86_bmi2_pdep_64(a, mask) } -} - -/// Gathers the bits of `x` specified by the `mask` into the contiguous low -/// order bit positions of the result. 
-#[inline(always)] -#[target_feature = "+bmi2"] -#[cfg_attr(test, assert_instr(pext))] -pub fn _pext_u32(a: u32, mask: u32) -> u32 { - unsafe { x86_bmi2_pext_32(a, mask) } -} - -/// Gathers the bits of `x` specified by the `mask` into the contiguous low -/// order bit positions of the result. -#[inline(always)] -#[target_feature = "+bmi2"] -#[cfg_attr(test, assert_instr(pext))] -#[cfg(not(target_arch = "x86"))] -pub fn _pext_u64(a: u64, mask: u64) -> u64 { - unsafe { x86_bmi2_pext_64(a, mask) } -} - #[cfg(test)] mod tests { use stdsimd_test::simd_test; @@ -128,8 +126,8 @@ mod tests { let m1 = 0b1110_1011_1110_1111u32; let s1 = 0b0001_0111_0100_0011u32; - assert_eq!(bmi2::_pext_u32(n, m0), s0); - assert_eq!(bmi2::_pext_u32(n, m1), s1); + assert_eq!(unsafe { bmi2::_pext_u32(n, m0) }, s0); + assert_eq!(unsafe { bmi2::_pext_u32(n, m1) }, s1); } #[simd_test = "bmi2"] @@ -143,8 +141,8 @@ mod tests { let m1 = 0b1110_1011_1110_1111u64; let s1 = 0b0001_0111_0100_0011u64; - assert_eq!(bmi2::_pext_u64(n, m0), s0); - assert_eq!(bmi2::_pext_u64(n, m1), s1); + assert_eq!(unsafe { bmi2::_pext_u64(n, m0) }, s0); + assert_eq!(unsafe { bmi2::_pext_u64(n, m1) }, s1); } #[simd_test = "bmi2"] @@ -157,8 +155,8 @@ mod tests { let m1 = 0b1110_1011_1110_1111u32; let s1 = 0b1110_1001_0010_0011u32; - assert_eq!(bmi2::_pdep_u32(n, m0), s0); - assert_eq!(bmi2::_pdep_u32(n, m1), s1); + assert_eq!(unsafe { bmi2::_pdep_u32(n, m0) }, s0); + assert_eq!(unsafe { bmi2::_pdep_u32(n, m1) }, s1); } #[simd_test = "bmi2"] @@ -172,15 +170,15 @@ mod tests { let m1 = 0b1110_1011_1110_1111u64; let s1 = 0b1110_1001_0010_0011u64; - assert_eq!(bmi2::_pdep_u64(n, m0), s0); - assert_eq!(bmi2::_pdep_u64(n, m1), s1); + assert_eq!(unsafe { bmi2::_pdep_u64(n, m0) }, s0); + assert_eq!(unsafe { bmi2::_pdep_u64(n, m1) }, s1); } #[simd_test = "bmi2"] fn _bzhi_u32() { let n = 0b1111_0010u32; let s = 0b0001_0010u32; - assert_eq!(bmi2::_bzhi_u32(n, 5), s); + assert_eq!(unsafe { bmi2::_bzhi_u32(n, 5) }, s); } #[simd_test = 
"bmi2"] @@ -188,14 +186,14 @@ mod tests { fn _bzhi_u64() { let n = 0b1111_0010u64; let s = 0b0001_0010u64; - assert_eq!(bmi2::_bzhi_u64(n, 5), s); + assert_eq!(unsafe { bmi2::_bzhi_u64(n, 5) }, s); } #[simd_test = "bmi2"] fn _mulx_u32() { let a: u32 = 4_294_967_200; let b: u32 = 2; - let (lo, hi): (u32, u32) = bmi2::_mulx_u32(a, b); + let (lo, hi): (u32, u32) = unsafe { bmi2::_mulx_u32(a, b) }; // result = 8589934400 // = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64 // ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -208,7 +206,7 @@ mod tests { fn _mulx_u64() { let a: u64 = 9_223_372_036_854_775_800; let b: u64 = 100; - let (lo, hi): (u64, u64) = bmi2::_mulx_u64(a, b); + let (lo, hi): (u64, u64) = unsafe { bmi2::_mulx_u64(a, b) }; // result = 922337203685477580000 // = 0b00110001_11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u128 // ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/library/stdarch/src/x86/sse.rs b/library/stdarch/src/x86/sse.rs index e164adcb6efd..b557cd0445ce 100644 --- a/library/stdarch/src/x86/sse.rs +++ b/library/stdarch/src/x86/sse.rs @@ -9,15 +9,15 @@ use stdsimd_test::assert_instr; #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(addss))] -pub fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 { - unsafe { addss(a, b) } +pub unsafe fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 { + addss(a, b) } /// Adds f32x4 vectors. #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(addps))] -pub fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 { +pub unsafe fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 { a + b } @@ -26,15 +26,15 @@ pub fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(subss))] -pub fn _mm_sub_ss(a: f32x4, b: f32x4) -> f32x4 { - unsafe { subss(a, b) } +pub unsafe fn _mm_sub_ss(a: f32x4, b: f32x4) -> f32x4 { + subss(a, b) } /// Subtracts f32x4 vectors. 
#[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(subps))] -pub fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 { +pub unsafe fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 { a - b } @@ -43,15 +43,15 @@ pub fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(mulss))] -pub fn _mm_mul_ss(a: f32x4, b: f32x4) -> f32x4 { - unsafe { mulss(a, b) } +pub unsafe fn _mm_mul_ss(a: f32x4, b: f32x4) -> f32x4 { + mulss(a, b) } /// Multiplies f32x4 vectors. #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(mulps))] -pub fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 { +pub unsafe fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 { a * b } @@ -60,15 +60,15 @@ pub fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(divss))] -pub fn _mm_div_ss(a: f32x4, b: f32x4) -> f32x4 { - unsafe { divss(a, b) } +pub unsafe fn _mm_div_ss(a: f32x4, b: f32x4) -> f32x4 { + divss(a, b) } /// Divides f32x4 vectors. 
#[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(divps))] -pub fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 { +pub unsafe fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 { a / b } @@ -77,8 +77,8 @@ pub fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(sqrtss))] -pub fn _mm_sqrt_ss(a: f32x4) -> f32x4 { - unsafe { sqrtss(a) } +pub unsafe fn _mm_sqrt_ss(a: f32x4) -> f32x4 { + sqrtss(a) } /// Return the square root of packed single-precision (32-bit) floating-point @@ -86,8 +86,8 @@ pub fn _mm_sqrt_ss(a: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(sqrtps))] -pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 { - unsafe { sqrtps(a) } +pub unsafe fn _mm_sqrt_ps(a: f32x4) -> f32x4 { + sqrtps(a) } /// Return the approximate reciprocal of the first single-precision @@ -95,8 +95,8 @@ pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(rcpss))] -pub fn _mm_rcp_ss(a: f32x4) -> f32x4 { - unsafe { rcpss(a) } +pub unsafe fn _mm_rcp_ss(a: f32x4) -> f32x4 { + rcpss(a) } /// Return the approximate reciprocal of packed single-precision (32-bit) @@ -104,8 +104,8 @@ pub fn _mm_rcp_ss(a: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(rcpps))] -pub fn _mm_rcp_ps(a: f32x4) -> f32x4 { - unsafe { rcpps(a) } +pub unsafe fn _mm_rcp_ps(a: f32x4) -> f32x4 { + rcpps(a) } /// Return the approximate reciprocal square root of the fist single-precision @@ -113,8 +113,8 @@ pub fn _mm_rcp_ps(a: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(rsqrtss))] -pub fn _mm_rsqrt_ss(a: f32x4) -> f32x4 { - unsafe { rsqrtss(a) } +pub unsafe fn _mm_rsqrt_ss(a: f32x4) -> f32x4 { + rsqrtss(a) } /// Return the approximate reciprocal square root of packed single-precision @@ -122,8 +122,8 @@ pub fn _mm_rsqrt_ss(a: f32x4) -> f32x4 { 
#[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(rsqrtps))] -pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 { - unsafe { rsqrtps(a) } +pub unsafe fn _mm_rsqrt_ps(a: f32x4) -> f32x4 { + rsqrtps(a) } /// Compare the first single-precision (32-bit) floating-point element of `a` @@ -132,8 +132,8 @@ pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(minss))] -pub fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 { - unsafe { minss(a, b) } +pub unsafe fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 { + minss(a, b) } /// Compare packed single-precision (32-bit) floating-point elements in `a` and @@ -141,8 +141,8 @@ pub fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(minps))] -pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 { - unsafe { minps(a, b) } +pub unsafe fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 { + minps(a, b) } /// Compare the first single-precision (32-bit) floating-point element of `a` @@ -151,8 +151,8 @@ pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(maxss))] -pub fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 { - unsafe { maxss(a, b) } +pub unsafe fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 { + maxss(a, b) } /// Compare packed single-precision (32-bit) floating-point elements in `a` and @@ -160,24 +160,23 @@ pub fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(maxps))] -pub fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 { - unsafe { maxps(a, b) } +pub unsafe fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 { + maxps(a, b) } -// Shuffle packed single-precision (32-bit) floating-point elements in `a` and `b` -// using `mask`. -// The lower half of result takes values from `a` and the higher half from `b`. -// Mask is split to 2 control bits each to index the element from inputs. 
+/// Shuffle packed single-precision (32-bit) floating-point elements in `a` and +/// `b` using `mask`. +/// +/// The lower half of result takes values from `a` and the higher half from +/// `b`. Mask is split to 2 control bits each to index the element from inputs. #[inline(always)] #[target_feature = "+sse"] -pub fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 { +pub unsafe fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 { let mask = (mask & 0xFF) as u8; macro_rules! shuffle_done { ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { - unsafe { - simd_shuffle4(a, b, [$x01, $x23, $x45, $x67]) - } + simd_shuffle4(a, b, [$x01, $x23, $x45, $x67]) } } macro_rules! shuffle_x67 { @@ -219,10 +218,10 @@ pub fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 { } #[cfg(test)] -#[cfg_attr(test, assert_instr(shufps))] #[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(shufps))] fn _test_mm_shuffle_ps(a: f32x4, b: f32x4) -> f32x4 { - _mm_shuffle_ps(a, b, 3) + unsafe { _mm_shuffle_ps(a, b, 3) } } /// Unpack and interleave single-precision (32-bit) floating-point elements @@ -230,8 +229,8 @@ fn _test_mm_shuffle_ps(a: f32x4, b: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(unpckhps))] -pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 { - unsafe { simd_shuffle4(a, b, [2, 6, 3, 7]) } +pub unsafe fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 { + simd_shuffle4(a, b, [2, 6, 3, 7]) } /// Unpack and interleave single-precision (32-bit) floating-point elements @@ -239,8 +238,8 @@ pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(unpcklps))] -pub fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 { - unsafe { simd_shuffle4(a, b, [0, 4, 1, 5]) } +pub unsafe fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 { + simd_shuffle4(a, b, [0, 4, 1, 5]) } /// Combine higher half of `a` and `b`. 
The highwe half of `b` occupies the lower @@ -249,9 +248,9 @@ pub fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 { #[target_feature = "+sse"] #[cfg_attr(all(test, not(windows)), assert_instr(movhlps))] #[cfg_attr(all(test, windows), assert_instr(unpckhpd))] -pub fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 { +pub unsafe fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 { // TODO; figure why this is a different instruction on Windows? - unsafe { simd_shuffle4(a, b, [6, 7, 2, 3]) } + simd_shuffle4(a, b, [6, 7, 2, 3]) } /// Combine lower half of `a` and `b`. The lower half of `b` occupies the higher @@ -259,8 +258,8 @@ pub fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(unpcklpd))] -pub fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 { - unsafe { simd_shuffle4(a, b, [0, 1, 4, 5]) } +pub unsafe fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 { + simd_shuffle4(a, b, [0, 1, 4, 5]) } /// Return a mask of the most significant bit of each element in `a`. 
@@ -270,8 +269,8 @@ pub fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(movmskps))] -pub fn _mm_movemask_ps(a: f32x4) -> i32 { - unsafe { movmskps(a) } +pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 { + movmskps(a) } #[allow(improper_ctypes)] @@ -318,7 +317,7 @@ mod tests { fn _mm_add_ps() { let a = f32x4::new(-1.0, 5.0, 0.0, -10.0); let b = f32x4::new(-100.0, 20.0, 0.0, -5.0); - let r = sse::_mm_add_ps(a, b); + let r = unsafe { sse::_mm_add_ps(a, b) }; assert_eq!(r, f32x4::new(-101.0, 25.0, 0.0, -15.0)); } @@ -326,7 +325,7 @@ mod tests { fn _mm_add_ss() { let a = f32x4::new(-1.0, 5.0, 0.0, -10.0); let b = f32x4::new(-100.0, 20.0, 0.0, -5.0); - let r = sse::_mm_add_ss(a, b); + let r = unsafe { sse::_mm_add_ss(a, b) }; assert_eq!(r, f32x4::new(-101.0, 5.0, 0.0, -10.0)); } @@ -334,7 +333,7 @@ mod tests { fn _mm_sub_ps() { let a = f32x4::new(-1.0, 5.0, 0.0, -10.0); let b = f32x4::new(-100.0, 20.0, 0.0, -5.0); - let r = sse::_mm_sub_ps(a, b); + let r = unsafe { sse::_mm_sub_ps(a, b) }; assert_eq!(r, f32x4::new(99.0, -15.0, 0.0, -5.0)); } @@ -342,7 +341,7 @@ mod tests { fn _mm_sub_ss() { let a = f32x4::new(-1.0, 5.0, 0.0, -10.0); let b = f32x4::new(-100.0, 20.0, 0.0, -5.0); - let r = sse::_mm_sub_ss(a, b); + let r = unsafe { sse::_mm_sub_ss(a, b) }; assert_eq!(r, f32x4::new(99.0, 5.0, 0.0, -10.0)); } @@ -350,7 +349,7 @@ mod tests { fn _mm_mul_ps() { let a = f32x4::new(-1.0, 5.0, 0.0, -10.0); let b = f32x4::new(-100.0, 20.0, 0.0, -5.0); - let r = sse::_mm_mul_ps(a, b); + let r = unsafe { sse::_mm_mul_ps(a, b) }; assert_eq!(r, f32x4::new(100.0, 100.0, 0.0, 50.0)); } @@ -358,7 +357,7 @@ mod tests { fn _mm_mul_ss() { let a = f32x4::new(-1.0, 5.0, 0.0, -10.0); let b = f32x4::new(-100.0, 20.0, 0.0, -5.0); - let r = sse::_mm_mul_ss(a, b); + let r = unsafe { sse::_mm_mul_ss(a, b) }; assert_eq!(r, f32x4::new(100.0, 5.0, 0.0, -10.0)); } @@ -366,7 +365,7 @@ mod tests { fn _mm_div_ps() { let a = 
f32x4::new(-1.0, 5.0, 2.0, -10.0); let b = f32x4::new(-100.0, 20.0, 0.2, -5.0); - let r = sse::_mm_div_ps(a, b); + let r = unsafe { sse::_mm_div_ps(a, b) }; assert_eq!(r, f32x4::new(0.01, 0.25, 10.0, 2.0)); } @@ -374,14 +373,14 @@ mod tests { fn _mm_div_ss() { let a = f32x4::new(-1.0, 5.0, 0.0, -10.0); let b = f32x4::new(-100.0, 20.0, 0.0, -5.0); - let r = sse::_mm_div_ss(a, b); + let r = unsafe { sse::_mm_div_ss(a, b) }; assert_eq!(r, f32x4::new(0.01, 5.0, 0.0, -10.0)); } #[simd_test = "sse"] fn _mm_sqrt_ss() { let a = f32x4::new(4.0, 13.0, 16.0, 100.0); - let r = sse::_mm_sqrt_ss(a); + let r = unsafe { sse::_mm_sqrt_ss(a) }; let e = f32x4::new(2.0, 13.0, 16.0, 100.0); assert_eq!(r, e); } @@ -389,7 +388,7 @@ mod tests { #[simd_test = "sse"] fn _mm_sqrt_ps() { let a = f32x4::new(4.0, 13.0, 16.0, 100.0); - let r = sse::_mm_sqrt_ps(a); + let r = unsafe { sse::_mm_sqrt_ps(a) }; let e = f32x4::new(2.0, 3.6055512, 4.0, 10.0); assert_eq!(r, e); } @@ -397,7 +396,7 @@ mod tests { #[simd_test = "sse"] fn _mm_rcp_ss() { let a = f32x4::new(4.0, 13.0, 16.0, 100.0); - let r = sse::_mm_rcp_ss(a); + let r = unsafe { sse::_mm_rcp_ss(a) }; let e = f32x4::new(0.24993896, 13.0, 16.0, 100.0); assert_eq!(r, e); } @@ -405,7 +404,7 @@ mod tests { #[simd_test = "sse"] fn _mm_rcp_ps() { let a = f32x4::new(4.0, 13.0, 16.0, 100.0); - let r = sse::_mm_rcp_ps(a); + let r = unsafe { sse::_mm_rcp_ps(a) }; let e = f32x4::new(0.24993896, 0.0769043, 0.06248474, 0.0099983215); assert_eq!(r, e); } @@ -413,7 +412,7 @@ mod tests { #[simd_test = "sse"] fn _mm_rsqrt_ss() { let a = f32x4::new(4.0, 13.0, 16.0, 100.0); - let r = sse::_mm_rsqrt_ss(a); + let r = unsafe { sse::_mm_rsqrt_ss(a) }; let e = f32x4::new(0.49987793, 13.0, 16.0, 100.0); assert_eq!(r, e); } @@ -421,7 +420,7 @@ mod tests { #[simd_test = "sse"] fn _mm_rsqrt_ps() { let a = f32x4::new(4.0, 13.0, 16.0, 100.0); - let r = sse::_mm_rsqrt_ps(a); + let r = unsafe { sse::_mm_rsqrt_ps(a) }; let e = f32x4::new(0.49987793, 0.2772827, 0.24993896, 
0.099990845); assert_eq!(r, e); } @@ -430,7 +429,7 @@ mod tests { fn _mm_min_ss() { let a = f32x4::new(-1.0, 5.0, 0.0, -10.0); let b = f32x4::new(-100.0, 20.0, 0.0, -5.0); - let r = sse::_mm_min_ss(a, b); + let r = unsafe { sse::_mm_min_ss(a, b) }; assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0)); } @@ -438,7 +437,7 @@ mod tests { fn _mm_min_ps() { let a = f32x4::new(-1.0, 5.0, 0.0, -10.0); let b = f32x4::new(-100.0, 20.0, 0.0, -5.0); - let r = sse::_mm_min_ps(a, b); + let r = unsafe { sse::_mm_min_ps(a, b) }; assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0)); } @@ -446,7 +445,7 @@ mod tests { fn _mm_max_ss() { let a = f32x4::new(-1.0, 5.0, 0.0, -10.0); let b = f32x4::new(-100.0, 20.0, 0.0, -5.0); - let r = sse::_mm_max_ss(a, b); + let r = unsafe { sse::_mm_max_ss(a, b) }; assert_eq!(r, f32x4::new(-1.0, 5.0, 0.0, -10.0)); } @@ -454,7 +453,7 @@ mod tests { fn _mm_max_ps() { let a = f32x4::new(-1.0, 5.0, 0.0, -10.0); let b = f32x4::new(-100.0, 20.0, 0.0, -5.0); - let r = sse::_mm_max_ps(a, b); + let r = unsafe { sse::_mm_max_ps(a, b) }; assert_eq!(r, f32x4::new(-1.0, 20.0, 0.0, -5.0)); } @@ -463,7 +462,7 @@ mod tests { let a = f32x4::new(1.0, 2.0, 3.0, 4.0); let b = f32x4::new(5.0, 6.0, 7.0, 8.0); let mask = 0b00_01_01_11; - let r = sse::_mm_shuffle_ps(a, b, mask); + let r = unsafe { sse::_mm_shuffle_ps(a, b, mask) }; assert_eq!(r, f32x4::new(4.0, 2.0, 6.0, 5.0)); } @@ -471,7 +470,7 @@ mod tests { fn _mm_unpackhi_ps() { let a = f32x4::new(1.0, 2.0, 3.0, 4.0); let b = f32x4::new(5.0, 6.0, 7.0, 8.0); - let r = sse::_mm_unpackhi_ps(a, b); + let r = unsafe { sse::_mm_unpackhi_ps(a, b) }; assert_eq!(r, f32x4::new(3.0, 7.0, 4.0, 8.0)); } @@ -479,7 +478,7 @@ mod tests { fn _mm_unpacklo_ps() { let a = f32x4::new(1.0, 2.0, 3.0, 4.0); let b = f32x4::new(5.0, 6.0, 7.0, 8.0); - let r = sse::_mm_unpacklo_ps(a, b); + let r = unsafe { sse::_mm_unpacklo_ps(a, b) }; assert_eq!(r, f32x4::new(1.0, 5.0, 2.0, 6.0)); } @@ -487,7 +486,7 @@ mod tests { fn _mm_movehl_ps() { let a = 
f32x4::new(1.0, 2.0, 3.0, 4.0); let b = f32x4::new(5.0, 6.0, 7.0, 8.0); - let r = sse::_mm_movehl_ps(a, b); + let r = unsafe { sse::_mm_movehl_ps(a, b) }; assert_eq!(r, f32x4::new(7.0, 8.0, 3.0, 4.0)); } @@ -495,16 +494,20 @@ mod tests { fn _mm_movelh_ps() { let a = f32x4::new(1.0, 2.0, 3.0, 4.0); let b = f32x4::new(5.0, 6.0, 7.0, 8.0); - let r = sse::_mm_movelh_ps(a, b); + let r = unsafe { sse::_mm_movelh_ps(a, b) }; assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0)); } #[simd_test = "sse"] fn _mm_movemask_ps() { - let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0)); + let r = unsafe { + sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0)) + }; assert_eq!(r, 0b0101); - let r = sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0)); + let r = unsafe { + sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0)) + }; assert_eq!(r, 0b0111); } } diff --git a/library/stdarch/src/x86/sse2.rs b/library/stdarch/src/x86/sse2.rs index d2079bd5af0d..80fac197facb 100644 --- a/library/stdarch/src/x86/sse2.rs +++ b/library/stdarch/src/x86/sse2.rs @@ -1,3 +1,6 @@ +#[cfg(test)] +use stdsimd_test::assert_instr; + use std::mem; use std::os::raw::c_void; use std::ptr; @@ -9,23 +12,22 @@ use x86::__m128i; use v128::*; use v64::*; -#[cfg(test)] -use stdsimd_test::assert_instr; - /// Provide a hint to the processor that the code sequence is a spin-wait loop. /// /// This can help improve the performance and power consumption of spin-wait /// loops. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_pause() { - unsafe { pause() } +#[cfg_attr(test, assert_instr(pause))] +pub unsafe fn _mm_pause() { + pause() } /// Invalidate and flush the cache line that contains `p` from all levels of /// the cache hierarchy. #[inline(always)] #[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(clflush))] pub unsafe fn _mm_clflush(p: *mut c_void) { clflush(p) } @@ -38,8 +40,9 @@ pub unsafe fn _mm_clflush(p: *mut c_void) { /// program order. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_lfence() { - unsafe { lfence() } +#[cfg_attr(test, assert_instr(lfence))] +pub unsafe fn _mm_lfence() { + lfence() } /// Perform a serializing operation on all load-from-memory and store-to-memory @@ -50,79 +53,89 @@ pub fn _mm_lfence() { /// which follows the fence in program order. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_mfence() { - unsafe { mfence() } +#[cfg_attr(test, assert_instr(mfence))] +pub unsafe fn _mm_mfence() { + mfence() } /// Add packed 8-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_add_epi8(a: i8x16, b: i8x16) -> i8x16 { +#[cfg_attr(test, assert_instr(paddb))] +pub unsafe fn _mm_add_epi8(a: i8x16, b: i8x16) -> i8x16 { a + b } /// Add packed 16-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_add_epi16(a: i16x8, b: i16x8) -> i16x8 { +#[cfg_attr(test, assert_instr(paddw))] +pub unsafe fn _mm_add_epi16(a: i16x8, b: i16x8) -> i16x8 { a + b } /// Add packed 32-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_add_epi32(a: i32x4, b: i32x4) -> i32x4 { +#[cfg_attr(test, assert_instr(paddd))] +pub unsafe fn _mm_add_epi32(a: i32x4, b: i32x4) -> i32x4 { a + b } /// Add packed 64-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_add_epi64(a: i64x2, b: i64x2) -> i64x2 { +#[cfg_attr(test, assert_instr(paddq))] +pub unsafe fn _mm_add_epi64(a: i64x2, b: i64x2) -> i64x2 { a + b } /// Add packed 8-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_adds_epi8(a: i8x16, b: i8x16) -> i8x16 { - unsafe { paddsb(a, b) } +#[cfg_attr(test, assert_instr(paddsb))] +pub unsafe fn _mm_adds_epi8(a: i8x16, b: i8x16) -> i8x16 { + paddsb(a, b) } /// Add packed 16-bit integers in `a` and `b` using saturation. 
#[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(paddsw))] -pub fn _mm_adds_epi16(a: i16x8, b: i16x8) -> i16x8 { - unsafe { paddsw(a, b) } +pub unsafe fn _mm_adds_epi16(a: i16x8, b: i16x8) -> i16x8 { + paddsw(a, b) } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_adds_epu8(a: u8x16, b: u8x16) -> u8x16 { - unsafe { paddsub(a, b) } +#[cfg_attr(test, assert_instr(paddusb))] +pub unsafe fn _mm_adds_epu8(a: u8x16, b: u8x16) -> u8x16 { + paddsub(a, b) } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_adds_epu16(a: u16x8, b: u16x8) -> u16x8 { - unsafe { paddsuw(a, b) } +#[cfg_attr(test, assert_instr(paddusw))] +pub unsafe fn _mm_adds_epu16(a: u16x8, b: u16x8) -> u16x8 { + paddsuw(a, b) } /// Average packed unsigned 8-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_avg_epu8(a: u8x16, b: u8x16) -> u8x16 { - unsafe { pavgb(a, b) } +#[cfg_attr(test, assert_instr(pavgb))] +pub unsafe fn _mm_avg_epu8(a: u8x16, b: u8x16) -> u8x16 { + pavgb(a, b) } /// Average packed unsigned 16-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_avg_epu16(a: u16x8, b: u16x8) -> u16x8 { - unsafe { pavgw(a, b) } +#[cfg_attr(test, assert_instr(pavgw))] +pub unsafe fn _mm_avg_epu16(a: u16x8, b: u16x8) -> u16x8 { + pavgw(a, b) } /// Multiply and then horizontally add signed 16 bit integers in `a` and `b`. @@ -132,40 +145,45 @@ pub fn _mm_avg_epu16(a: u16x8, b: u16x8) -> u16x8 { /// intermediate 32-bit integers. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_madd_epi16(a: i16x8, b: i16x8) -> i32x4 { - unsafe { pmaddwd(a, b) } +#[cfg_attr(test, assert_instr(pmaddwd))] +pub unsafe fn _mm_madd_epi16(a: i16x8, b: i16x8) -> i32x4 { + pmaddwd(a, b) } /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// maximum values. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_max_epi16(a: i16x8, b: i16x8) -> i16x8 { - unsafe { pmaxsw(a, b) } +#[cfg_attr(test, assert_instr(pmaxsw))] +pub unsafe fn _mm_max_epi16(a: i16x8, b: i16x8) -> i16x8 { + pmaxsw(a, b) } /// Compare packed unsigned 8-bit integers in `a` and `b`, and return the /// packed maximum values. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_max_epu8(a: u8x16, b: u8x16) -> u8x16 { - unsafe { pmaxub(a, b) } +#[cfg_attr(test, assert_instr(pmaxub))] +pub unsafe fn _mm_max_epu8(a: u8x16, b: u8x16) -> u8x16 { + pmaxub(a, b) } /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// minimum values. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_min_epi16(a: i16x8, b: i16x8) -> i16x8 { - unsafe { pminsw(a, b) } +#[cfg_attr(test, assert_instr(pminsw))] +pub unsafe fn _mm_min_epi16(a: i16x8, b: i16x8) -> i16x8 { + pminsw(a, b) } /// Compare packed unsigned 8-bit integers in `a` and `b`, and return the /// packed minimum values. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_min_epu8(a: u8x16, b: u8x16) -> u8x16 { - unsafe { pminub(a, b) } +#[cfg_attr(test, assert_instr(pminub))] +pub unsafe fn _mm_min_epu8(a: u8x16, b: u8x16) -> u8x16 { + pminub(a, b) } /// Multiply the packed 16-bit integers in `a` and `b`. @@ -174,8 +192,9 @@ pub fn _mm_min_epu8(a: u8x16, b: u8x16) -> u8x16 { /// high 16 bits of the intermediate integers. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_mulhi_epi16(a: i16x8, b: i16x8) -> i16x8 { - unsafe { pmulhw(a, b) } +#[cfg_attr(test, assert_instr(pmulhw))] +pub unsafe fn _mm_mulhi_epi16(a: i16x8, b: i16x8) -> i16x8 { + pmulhw(a, b) } /// Multiply the packed unsigned 16-bit integers in `a` and `b`. @@ -184,8 +203,9 @@ pub fn _mm_mulhi_epi16(a: i16x8, b: i16x8) -> i16x8 { /// high 16 bits of the intermediate integers. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_mulhi_epu16(a: u16x8, b: u16x8) -> u16x8 { - unsafe { pmulhuw(a, b) } +#[cfg_attr(test, assert_instr(pmulhuw))] +pub unsafe fn _mm_mulhi_epu16(a: u16x8, b: u16x8) -> u16x8 { + pmulhuw(a, b) } /// Multiply the packed 16-bit integers in `a` and `b`. @@ -194,7 +214,8 @@ pub fn _mm_mulhi_epu16(a: u16x8, b: u16x8) -> u16x8 { /// low 16 bits of the intermediate integers. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_mullo_epi16(a: i16x8, b: i16x8) -> i16x8 { +#[cfg_attr(test, assert_instr(pmullw))] +pub unsafe fn _mm_mullo_epi16(a: i16x8, b: i16x8) -> i16x8 { a * b } @@ -204,8 +225,9 @@ pub fn _mm_mullo_epi16(a: i16x8, b: i16x8) -> i16x8 { /// Return the unsigned 64-bit results. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_mul_epu32(a: u32x4, b: u32x4) -> u64x2 { - unsafe { pmuludq(a, b) } +#[cfg_attr(test, assert_instr(pmuludq))] +pub unsafe fn _mm_mul_epu32(a: u32x4, b: u32x4) -> u64x2 { + pmuludq(a, b) } /// Sum the absolute differences of packed unsigned 8-bit integers. @@ -216,35 +238,40 @@ pub fn _mm_mul_epu32(a: u32x4, b: u32x4) -> u64x2 { /// the low 16 bits of 64-bit elements returned. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_sad_epu8(a: u8x16, b: u8x16) -> u64x2 { - unsafe { psadbw(a, b) } +#[cfg_attr(test, assert_instr(psadbw))] +pub unsafe fn _mm_sad_epu8(a: u8x16, b: u8x16) -> u64x2 { + psadbw(a, b) } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_sub_epi8(a: i8x16, b: i8x16) -> i8x16 { +#[cfg_attr(test, assert_instr(psubb))] +pub unsafe fn _mm_sub_epi8(a: i8x16, b: i8x16) -> i8x16 { a - b } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_sub_epi16(a: i16x8, b: i16x8) -> i16x8 { +#[cfg_attr(test, assert_instr(psubw))] +pub unsafe fn _mm_sub_epi16(a: i16x8, b: i16x8) -> i16x8 { a - b } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_sub_epi32(a: i32x4, b: i32x4) -> i32x4 { +#[cfg_attr(test, assert_instr(psubd))] +pub unsafe fn _mm_sub_epi32(a: i32x4, b: i32x4) -> i32x4 { a - b } /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_sub_epi64(a: i64x2, b: i64x2) -> i64x2 { +#[cfg_attr(test, assert_instr(psubq))] +pub unsafe fn _mm_sub_epi64(a: i64x2, b: i64x2) -> i64x2 { a - b } @@ -252,54 +279,56 @@ pub fn _mm_sub_epi64(a: i64x2, b: i64x2) -> i64x2 { /// using saturation. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_subs_epi8(a: i8x16, b: i8x16) -> i8x16 { - unsafe { psubsb(a, b) } +#[cfg_attr(test, assert_instr(psubsb))] +pub unsafe fn _mm_subs_epi8(a: i8x16, b: i8x16) -> i8x16 { + psubsb(a, b) } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// using saturation. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_subs_epi16(a: i16x8, b: i16x8) -> i16x8 { - unsafe { psubsw(a, b) } +#[cfg_attr(test, assert_instr(psubsw))] +pub unsafe fn _mm_subs_epi16(a: i16x8, b: i16x8) -> i16x8 { + psubsw(a, b) } /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit /// integers in `a` using saturation. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_subs_epu8(a: u8x16, b: u8x16) -> u8x16 { - unsafe { psubusb(a, b) } +#[cfg_attr(test, assert_instr(psubusb))] +pub unsafe fn _mm_subs_epu8(a: u8x16, b: u8x16) -> u8x16 { + psubusb(a, b) } /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit /// integers in `a` using saturation. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_subs_epu16(a: u16x8, b: u16x8) -> u16x8 { - unsafe { psubusw(a, b) } +#[cfg_attr(test, assert_instr(psubusw))] +pub unsafe fn _mm_subs_epu16(a: u16x8, b: u16x8) -> u16x8 { + psubusw(a, b) } /// Shift `a` left by `imm8` bytes while shifting in zeros. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { +pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { let (zero, imm8) = (__m128i::splat(0), imm8 as u32); const fn sub(a: u32, b: u32) -> u32 { a - b } macro_rules! shuffle { ($shift:expr) => { - unsafe { - simd_shuffle16::<__m128i, __m128i>(zero, a, [ - sub(16, $shift), sub(17, $shift), - sub(18, $shift), sub(19, $shift), - sub(20, $shift), sub(21, $shift), - sub(22, $shift), sub(23, $shift), - sub(24, $shift), sub(25, $shift), - sub(26, $shift), sub(27, $shift), - sub(28, $shift), sub(29, $shift), - sub(30, $shift), sub(31, $shift), - ]) - } + simd_shuffle16::<__m128i, __m128i>(zero, a, [ + sub(16, $shift), sub(17, $shift), + sub(18, $shift), sub(19, $shift), + sub(20, $shift), sub(21, $shift), + sub(22, $shift), sub(23, $shift), + sub(24, $shift), sub(25, $shift), + sub(26, $shift), sub(27, $shift), + sub(28, $shift), sub(29, $shift), + sub(30, $shift), sub(31, $shift), + ]) } } match imm8 { @@ -315,117 +344,146 @@ pub fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { } } +#[cfg(test)] +#[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(pslldq))] +fn _test_mm_slli_si128(a: __m128i) -> __m128i { + unsafe { _mm_slli_si128(a, 1) } +} + /// Shift `a` left by `imm8` bytes while 
shifting in zeros. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i { +pub unsafe fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i { _mm_slli_si128(a, imm8) } +#[cfg(test)] +#[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(pslldq))] +fn _test_mm_bslli_si128(a: __m128i) -> __m128i { + unsafe { _mm_bslli_si128(a, 1) } +} + /// Shift `a` right by `imm8` bytes while shifting in zeros. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i { +pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i { _mm_srli_si128(a, imm8) } +#[cfg(test)] +#[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(psrldq))] +fn _test_mm_bsrli_si128(a: __m128i) -> __m128i { + unsafe { _mm_bsrli_si128(a, 1) } +} + /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_slli_epi16(a: i16x8, imm8: i32) -> i16x8 { - unsafe { pslliw(a, imm8) } +#[cfg_attr(test, assert_instr(psllw))] +pub unsafe fn _mm_slli_epi16(a: i16x8, imm8: i32) -> i16x8 { + pslliw(a, imm8) } /// Shift packed 16-bit integers in `a` left by `count` while shifting in /// zeros. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_sll_epi16(a: i16x8, count: i16x8) -> i16x8 { - unsafe { psllw(a, count) } +#[cfg_attr(test, assert_instr(psllw))] +pub unsafe fn _mm_sll_epi16(a: i16x8, count: i16x8) -> i16x8 { + psllw(a, count) } /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_slli_epi32(a: i32x4, imm8: i32) -> i32x4 { - unsafe { psllid(a, imm8) } +#[cfg_attr(test, assert_instr(pslld))] +pub unsafe fn _mm_slli_epi32(a: i32x4, imm8: i32) -> i32x4 { + psllid(a, imm8) } /// Shift packed 32-bit integers in `a` left by `count` while shifting in /// zeros. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_sll_epi32(a: i32x4, count: i32x4) -> i32x4 { - unsafe { pslld(a, count) } +#[cfg_attr(test, assert_instr(pslld))] +pub unsafe fn _mm_sll_epi32(a: i32x4, count: i32x4) -> i32x4 { + pslld(a, count) } /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_slli_epi64(a: i64x2, imm8: i32) -> i64x2 { - unsafe { pslliq(a, imm8) } +#[cfg_attr(test, assert_instr(psllq))] +pub unsafe fn _mm_slli_epi64(a: i64x2, imm8: i32) -> i64x2 { + pslliq(a, imm8) } /// Shift packed 64-bit integers in `a` left by `count` while shifting in /// zeros. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_sll_epi64(a: i64x2, count: i64x2) -> i64x2 { - unsafe { psllq(a, count) } +#[cfg_attr(test, assert_instr(psllq))] +pub unsafe fn _mm_sll_epi64(a: i64x2, count: i64x2) -> i64x2 { + psllq(a, count) } /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign /// bits. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_srai_epi16(a: i16x8, imm8: i32) -> i16x8 { - unsafe { psraiw(a, imm8) } +#[cfg_attr(test, assert_instr(psraw))] +pub unsafe fn _mm_srai_epi16(a: i16x8, imm8: i32) -> i16x8 { + psraiw(a, imm8) } /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign /// bits. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_sra_epi16(a: i16x8, count: i16x8) -> i16x8 { - unsafe { psraw(a, count) } +#[cfg_attr(test, assert_instr(psraw))] +pub unsafe fn _mm_sra_epi16(a: i16x8, count: i16x8) -> i16x8 { + psraw(a, count) } /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign /// bits. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_srai_epi32(a: i32x4, imm8: i32) -> i32x4 { - unsafe { psraid(a, imm8) } +#[cfg_attr(test, assert_instr(psrad))] +pub unsafe fn _mm_srai_epi32(a: i32x4, imm8: i32) -> i32x4 { + psraid(a, imm8) } /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign /// bits. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_sra_epi32(a: i32x4, count: i32x4) -> i32x4 { - unsafe { psrad(a, count) } +#[cfg_attr(test, assert_instr(psrad))] +pub unsafe fn _mm_sra_epi32(a: i32x4, count: i32x4) -> i32x4 { + psrad(a, count) } /// Shift `a` right by `imm8` bytes while shifting in zeros. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { +pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { let (zero, imm8) = (__m128i::splat(0), imm8 as u32); const fn add(a: u32, b: u32) -> u32 { a + b } macro_rules! shuffle { ($shift:expr) => { - unsafe { - simd_shuffle16::<__m128i, __m128i>(a, zero, [ - add(0, $shift), add(1, $shift), - add(2, $shift), add(3, $shift), - add(4, $shift), add(5, $shift), - add(6, $shift), add(7, $shift), - add(8, $shift), add(9, $shift), - add(10, $shift), add(11, $shift), - add(12, $shift), add(13, $shift), - add(14, $shift), add(15, $shift), - ]) - } + simd_shuffle16::<__m128i, __m128i>(a, zero, [ + add(0, $shift), add(1, $shift), + add(2, $shift), add(3, $shift), + add(4, $shift), add(5, $shift), + add(6, $shift), add(7, $shift), + add(8, $shift), add(9, $shift), + add(10, $shift), add(11, $shift), + add(12, $shift), add(13, $shift), + add(14, $shift), add(15, $shift), + ]) } } match imm8 { @@ -441,59 +499,73 @@ pub fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { } } +#[cfg(test)] +#[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(psrldq))] +fn _test_mm_srli_si128(a: __m128i) -> __m128i { + unsafe { _mm_srli_si128(a, 1) } +} + /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in /// 
zeros. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_srli_epi16(a: i16x8, imm8: i32) -> i16x8 { - unsafe { psrliw(a, imm8) } +#[cfg_attr(test, assert_instr(psrlw))] +pub unsafe fn _mm_srli_epi16(a: i16x8, imm8: i32) -> i16x8 { + psrliw(a, imm8) } /// Shift packed 16-bit integers in `a` right by `count` while shifting in /// zeros. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_srl_epi16(a: i16x8, count: i16x8) -> i16x8 { - unsafe { psrlw(a, count) } +#[cfg_attr(test, assert_instr(psrlw))] +pub unsafe fn _mm_srl_epi16(a: i16x8, count: i16x8) -> i16x8 { + psrlw(a, count) } /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in /// zeros. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_srli_epi32(a: i32x4, imm8: i32) -> i32x4 { - unsafe { psrlid(a, imm8) } +#[cfg_attr(test, assert_instr(psrld))] +pub unsafe fn _mm_srli_epi32(a: i32x4, imm8: i32) -> i32x4 { + psrlid(a, imm8) } /// Shift packed 32-bit integers in `a` right by `count` while shifting in /// zeros. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_srl_epi32(a: i32x4, count: i32x4) -> i32x4 { - unsafe { psrld(a, count) } +#[cfg_attr(test, assert_instr(psrld))] +pub unsafe fn _mm_srl_epi32(a: i32x4, count: i32x4) -> i32x4 { + psrld(a, count) } /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in /// zeros. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_srli_epi64(a: i64x2, imm8: i32) -> i64x2 { - unsafe { psrliq(a, imm8) } +#[cfg_attr(test, assert_instr(psrlq))] +pub unsafe fn _mm_srli_epi64(a: i64x2, imm8: i32) -> i64x2 { + psrliq(a, imm8) } /// Shift packed 64-bit integers in `a` right by `count` while shifting in /// zeros. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_srl_epi64(a: i64x2, count: i64x2) -> i64x2 { - unsafe { psrlq(a, count) } +#[cfg_attr(test, assert_instr(psrlq))] +pub unsafe fn _mm_srl_epi64(a: i64x2, count: i64x2) -> i64x2 { + psrlq(a, count) } /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and /// `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { +#[cfg_attr(test, assert_instr(andps))] +pub unsafe fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { a & b } @@ -501,7 +573,8 @@ pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { /// then AND with `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { +#[cfg_attr(test, assert_instr(andnps))] +pub unsafe fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { (!a) & b } @@ -509,7 +582,8 @@ pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { /// `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { +#[cfg_attr(test, assert_instr(orps))] +pub unsafe fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { a | b } @@ -517,70 +591,80 @@ pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { /// `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { +#[cfg_attr(test, assert_instr(xorps))] +pub unsafe fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { a ^ b } /// Compare packed 8-bit integers in `a` and `b` for equality. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpeq_epi8(a: i8x16, b: i8x16) -> i8x16 { +#[cfg_attr(test, assert_instr(pcmpeqb))] +pub unsafe fn _mm_cmpeq_epi8(a: i8x16, b: i8x16) -> i8x16 { a.eq(b) } /// Compare packed 16-bit integers in `a` and `b` for equality. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpeq_epi16(a: i16x8, b: i16x8) -> i16x8 { +#[cfg_attr(test, assert_instr(pcmpeqw))] +pub unsafe fn _mm_cmpeq_epi16(a: i16x8, b: i16x8) -> i16x8 { a.eq(b) } /// Compare packed 32-bit integers in `a` and `b` for equality. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpeq_epi32(a: i32x4, b: i32x4) -> i32x4 { +#[cfg_attr(test, assert_instr(pcmpeqd))] +pub unsafe fn _mm_cmpeq_epi32(a: i32x4, b: i32x4) -> i32x4 { a.eq(b) } /// Compare packed 8-bit integers in `a` and `b` for greater-than. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpgt_epi8(a: i8x16, b: i8x16) -> i8x16 { +#[cfg_attr(test, assert_instr(pcmpgtb))] +pub unsafe fn _mm_cmpgt_epi8(a: i8x16, b: i8x16) -> i8x16 { a.gt(b) } /// Compare packed 16-bit integers in `a` and `b` for greater-than. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpgt_epi16(a: i16x8, b: i16x8) -> i16x8 { +#[cfg_attr(test, assert_instr(pcmpgtw))] +pub unsafe fn _mm_cmpgt_epi16(a: i16x8, b: i16x8) -> i16x8 { a.gt(b) } /// Compare packed 32-bit integers in `a` and `b` for greater-than. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpgt_epi32(a: i32x4, b: i32x4) -> i32x4 { +#[cfg_attr(test, assert_instr(pcmpgtd))] +pub unsafe fn _mm_cmpgt_epi32(a: i32x4, b: i32x4) -> i32x4 { a.gt(b) } /// Compare packed 8-bit integers in `a` and `b` for less-than. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmplt_epi8(a: i8x16, b: i8x16) -> i8x16 { +#[cfg_attr(test, assert_instr(pcmpgtb))] +pub unsafe fn _mm_cmplt_epi8(a: i8x16, b: i8x16) -> i8x16 { a.lt(b) } /// Compare packed 16-bit integers in `a` and `b` for less-than. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmplt_epi16(a: i16x8, b: i16x8) -> i16x8 { +#[cfg_attr(test, assert_instr(pcmpgtw))] +pub unsafe fn _mm_cmplt_epi16(a: i16x8, b: i16x8) -> i16x8 { a.lt(b) } /// Compare packed 32-bit integers in `a` and `b` for less-than. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 { +#[cfg_attr(test, assert_instr(pcmpgtd))] +pub unsafe fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 { a.lt(b) } @@ -588,31 +672,37 @@ pub fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 { /// double-precision (64-bit) floating-point elements. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cvtepi32_pd(a: i32x4) -> f64x2 { - unsafe { simd_cast::(simd_shuffle2(a, a, [0, 1])) } +#[cfg_attr(test, assert_instr(cvtdq2pd))] +pub unsafe fn _mm_cvtepi32_pd(a: i32x4) -> f64x2 { + simd_cast::(simd_shuffle2(a, a, [0, 1])) } /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cvtsi32_sd(a: f64x2, b: i32) -> f64x2 { +#[cfg_attr(test, assert_instr(cvtsi2sd))] +pub unsafe fn _mm_cvtsi32_sd(a: f64x2, b: i32) -> f64x2 { a.replace(0, b as f64) } /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. +#[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cvtsi64_sd(a: f64x2, b: i64) -> f64x2 { +#[cfg_attr(test, assert_instr(cvtsi2sd))] +pub unsafe fn _mm_cvtsi64_sd(a: f64x2, b: i64) -> f64x2 { a.replace(0, b as f64) } /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. +#[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cvtsi64x_sd(a: f64x2, b: i64) -> f64x2 { +#[cfg_attr(test, assert_instr(cvtsi2sd))] +pub unsafe fn _mm_cvtsi64x_sd(a: f64x2, b: i64) -> f64x2 { _mm_cvtsi64_sd(a, b) } @@ -620,52 +710,63 @@ pub fn _mm_cvtsi64x_sd(a: f64x2, b: i64) -> f64x2 { /// floating-point elements. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cvtepi32_ps(a: i32x4) -> f32x4 { - unsafe { cvtdq2ps(a) } +#[cfg_attr(test, assert_instr(cvtdq2ps))] +pub unsafe fn _mm_cvtepi32_ps(a: i32x4) -> f32x4 { + cvtdq2ps(a) } /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cvtsi32_si128(a: i32) -> i32x4 { +// no particular instruction to test +pub unsafe fn _mm_cvtsi32_si128(a: i32) -> i32x4 { i32x4::new(a, 0, 0, 0) } /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. +#[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cvtsi64_si128(a: i64) -> i64x2 { +// no particular instruction to test +pub unsafe fn _mm_cvtsi64_si128(a: i64) -> i64x2 { i64x2::new(a, 0) } /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. +#[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cvtsi64x_si128(a: i64) -> i64x2 { +// no particular instruction to test +pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> i64x2 { _mm_cvtsi64_si128(a) } /// Return the lowest element of `a`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cvtsi128_si32(a: i32x4) -> i32 { +// no particular instruction to test +pub unsafe fn _mm_cvtsi128_si32(a: i32x4) -> i32 { a.extract(0) } /// Return the lowest element of `a`. +#[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cvtsi128_si64(a: i64x2) -> i64 { +// no particular instruction to test +pub unsafe fn _mm_cvtsi128_si64(a: i64x2) -> i64 { a.extract(0) } /// Return the lowest element of `a`. 
+#[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cvtsi128_si64x(a: i64x2) -> i64 { +// no particular instruction to test +pub unsafe fn _mm_cvtsi128_si64x(a: i64x2) -> i64 { _mm_cvtsi128_si64(a) } @@ -673,21 +774,24 @@ pub fn _mm_cvtsi128_si64x(a: i64x2) -> i64 { /// lowest. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_set_epi64x(e1: i64, e0: i64) -> i64x2 { +// no particular instruction to test +pub unsafe fn _mm_set_epi64x(e1: i64, e0: i64) -> i64x2 { i64x2::new(e0, e1) } /// Set packed 32-bit integers with the supplied values. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 { +// no particular instruction to test +pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 { i32x4::new(e0, e1, e2, e3) } /// Set packed 16-bit integers with the supplied values. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_set_epi16( +// no particular instruction to test +pub unsafe fn _mm_set_epi16( e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16, ) -> i16x8 { i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7) @@ -696,7 +800,8 @@ pub fn _mm_set_epi16( /// Set packed 8-bit integers with the supplied values. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_set_epi8( +// no particular instruction to test +pub unsafe fn _mm_set_epi8( e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8, e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8, ) -> i8x16 { @@ -708,42 +813,48 @@ pub fn _mm_set_epi8( /// Broadcast 64-bit integer `a` to all elements. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_set1_epi64x(a: i64) -> i64x2 { +// no particular instruction to test +pub unsafe fn _mm_set1_epi64x(a: i64) -> i64x2 { i64x2::splat(a) } /// Broadcast 32-bit integer `a` to all elements. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_set1_epi32(a: i32) -> i32x4 { +// no particular instruction to test +pub unsafe fn _mm_set1_epi32(a: i32) -> i32x4 { i32x4::splat(a) } /// Broadcast 16-bit integer `a` to all elements. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_set1_epi16(a: i16) -> i16x8 { +// no particular instruction to test +pub unsafe fn _mm_set1_epi16(a: i16) -> i16x8 { i16x8::splat(a) } /// Broadcast 8-bit integer `a` to all elements. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_set1_epi8(a: i8) -> i8x16 { +// no particular instruction to test +pub unsafe fn _mm_set1_epi8(a: i8) -> i8x16 { i8x16::splat(a) } /// Set packed 32-bit integers with the supplied values in reverse order. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 { +// no particular instruction to test +pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 { i32x4::new(e3, e2, e1, e0) } /// Set packed 16-bit integers with the supplied values in reverse order. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_setr_epi16( +// no particular instruction to test +pub unsafe fn _mm_setr_epi16( e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16, ) -> i16x8 { i16x8::new(e7, e6, e5, e4, e3, e2, e1, e0) @@ -752,7 +863,8 @@ pub fn _mm_setr_epi16( /// Set packed 8-bit integers with the supplied values in reverse order. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_setr_epi8( +// no particular instruction to test +pub unsafe fn _mm_setr_epi8( e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8, e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8, ) -> i8x16 { @@ -764,13 +876,15 @@ pub fn _mm_setr_epi8( /// Returns a vector with all elements set to zero. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_setzero_si128() -> __m128i { +#[cfg_attr(test, assert_instr(xorps))] +pub unsafe fn _mm_setzero_si128() -> __m128i { __m128i::splat(0) } /// Load 64-bit integer from memory into first element of returned vector. #[inline(always)] #[target_feature = "+sse2"] +// no particular instruction to test pub unsafe fn _mm_loadl_epi64(mem_addr: *const i64x2) -> i64x2 { i64x2::new((*mem_addr).extract(0), 0) } @@ -780,6 +894,7 @@ pub unsafe fn _mm_loadl_epi64(mem_addr: *const i64x2) -> i64x2 { /// `mem_addr` must be aligned on a 16-byte boundary. #[inline(always)] #[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i { *mem_addr } @@ -789,6 +904,7 @@ pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i { /// `mem_addr` does not need to be aligned on any particular boundary. #[inline(always)] #[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(movups))] pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i { let mut dst = mem::uninitialized(); ptr::copy_nonoverlapping( @@ -808,6 +924,7 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i { /// to be aligned on any particular boundary. #[inline(always)] #[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(maskmovdqu))] pub unsafe fn _mm_maskmoveu_si128(a: i8x16, mask: i8x16, mem_addr: *mut i8) { maskmovdqu(a, mask, mem_addr) } @@ -817,6 +934,7 @@ pub unsafe fn _mm_maskmoveu_si128(a: i8x16, mask: i8x16, mem_addr: *mut i8) { /// `mem_addr` must be aligned on a 16-byte boundary. #[inline(always)] #[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) { *mem_addr = a; } @@ -826,6 +944,7 @@ pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) { /// `mem_addr` does not need to be aligned on any particular boundary. 
#[inline(always)] #[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(movups))] pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { ptr::copy_nonoverlapping( &a as *const _ as *const u8, @@ -838,6 +957,7 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { /// `mem_addr` does not need to be aligned on any particular boundary. #[inline(always)] #[target_feature = "+sse2"] +// no particular instruction to test pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { ptr::copy_nonoverlapping( &a as *const _ as *const u8, mem_addr as *mut u8, 8); @@ -847,59 +967,78 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { /// element is zero. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_move_epi64(a: i64x2) -> i64x2 { - a.replace(1, 0) +// no particular instruction to test +pub unsafe fn _mm_move_epi64(a: i64x2) -> i64x2 { + simd_shuffle2(a, i64x2::splat(0), [0, 2]) } /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_packs_epi16(a: i16x8, b: i16x8) -> i8x16 { - unsafe { packsswb(a, b) } +#[cfg_attr(test, assert_instr(packsswb))] +pub unsafe fn _mm_packs_epi16(a: i16x8, b: i16x8) -> i8x16 { + packsswb(a, b) } /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_packs_epi32(a: i32x4, b: i32x4) -> i16x8 { - unsafe { packssdw(a, b) } +#[cfg_attr(test, assert_instr(packssdw))] +pub unsafe fn _mm_packs_epi32(a: i32x4, b: i32x4) -> i16x8 { + packssdw(a, b) } /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_packus_epi16(a: i16x8, b: i16x8) -> u8x16 { - unsafe { packuswb(a, b) } +#[cfg_attr(test, assert_instr(packuswb))] +pub unsafe fn _mm_packus_epi16(a: i16x8, b: i16x8) -> u8x16 { + packuswb(a, b) } /// Return the `imm8` element of `a`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_extract_epi16(a: i16x8, imm8: i32) -> i32 { +pub unsafe fn _mm_extract_epi16(a: i16x8, imm8: i32) -> i32 { a.extract(imm8 as u32 & 0b111) as i32 } +#[cfg(test)] +#[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(pextrw))] +fn _test_mm_extract_epi16(a: i16x8) -> i32 { + unsafe { _mm_extract_epi16(a, 9) } +} + /// Return a new vector where the `imm8` element of `a` is replaced with `i`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_insert_epi16(a: i16x8, i: i32, imm8: i32) -> i16x8 { +pub unsafe fn _mm_insert_epi16(a: i16x8, i: i32, imm8: i32) -> i16x8 { a.replace(imm8 as u32 & 0b111, i as i16) } +#[cfg(test)] +#[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(pinsrw))] +fn _test_mm_insert_epi16(a: i16x8, i: i32) -> i16x8 { + unsafe { _mm_insert_epi16(a, i, 9) } +} + /// Return a mask of the most significant bit of each element in `a`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_movemask_epi8(a: i8x16) -> i32 { - unsafe { pmovmskb(a) } +#[cfg_attr(test, assert_instr(pmovmskb))] +pub unsafe fn _mm_movemask_epi8(a: i8x16) -> i32 { + pmovmskb(a) } /// Shuffle 32-bit integers in `a` using the control in `imm8`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 { +pub unsafe fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 { // simd_shuffleX requires that its selector parameter be made up of // constant values, but we can't enforce that here. In spirit, we need // to write a `match` on all possible values of a byte, and for each value, @@ -911,9 +1050,7 @@ pub fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 { macro_rules! 
shuffle_done { ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { - unsafe { - simd_shuffle4(a, a, [$x01, $x23, $x45, $x67]) - } + simd_shuffle4(a, a, [$x01, $x23, $x45, $x67]) } } macro_rules! shuffle_x67 { @@ -954,6 +1091,13 @@ pub fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 { } } +#[cfg(test)] +#[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(pshufd))] +fn _test_mm_shuffle_epi32(a: i32x4) -> i32x4 { + unsafe { _mm_shuffle_epi32(a, 9) } +} + /// Shuffle 16-bit integers in the high 64 bits of `a` using the control in /// `imm8`. /// @@ -961,18 +1105,16 @@ pub fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 { /// bits being copied from `a`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_shufflehi_epi16(a: i16x8, imm8: i32) -> i16x8 { +pub unsafe fn _mm_shufflehi_epi16(a: i16x8, imm8: i32) -> i16x8 { // See _mm_shuffle_epi32. let imm8 = (imm8 & 0xFF) as u8; const fn add4(x: u32) -> u32 { x + 4 } macro_rules! shuffle_done { ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { - unsafe { - simd_shuffle8(a, a, [ - 0, 1, 2, 3, add4($x01), add4($x23), add4($x45), add4($x67), - ]) - } + simd_shuffle8(a, a, [ + 0, 1, 2, 3, add4($x01), add4($x23), add4($x45), add4($x67), + ]) } } macro_rules! shuffle_x67 { @@ -1013,6 +1155,13 @@ pub fn _mm_shufflehi_epi16(a: i16x8, imm8: i32) -> i16x8 { } } +#[cfg(test)] +#[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(pshufhw))] +fn _test_mm_shufflehi_epi16(a: i16x8) -> i16x8 { + unsafe { _mm_shufflehi_epi16(a, 9) } +} + /// Shuffle 16-bit integers in the low 64 bits of `a` using the control in /// `imm8`. /// @@ -1020,15 +1169,13 @@ pub fn _mm_shufflehi_epi16(a: i16x8, imm8: i32) -> i16x8 { /// bits being copied from `a`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_shufflelo_epi16(a: i16x8, imm8: i32) -> i16x8 { +pub unsafe fn _mm_shufflelo_epi16(a: i16x8, imm8: i32) -> i16x8 { // See _mm_shuffle_epi32. let imm8 = (imm8 & 0xFF) as u8; macro_rules! 
shuffle_done { ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { - unsafe { - simd_shuffle8(a, a, [$x01, $x23, $x45, $x67, 4, 5, 6, 7]) - } + simd_shuffle8(a, a, [$x01, $x23, $x45, $x67, 4, 5, 6, 7]) } } macro_rules! shuffle_x67 { @@ -1069,77 +1216,89 @@ pub fn _mm_shufflelo_epi16(a: i16x8, imm8: i32) -> i16x8 { } } +#[cfg(test)] +#[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(pshuflw))] +fn _test_mm_shufflelo_epi16(a: i16x8) -> i16x8 { + unsafe { _mm_shufflelo_epi16(a, 9) } +} + /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_unpackhi_epi8(a: i8x16, b: i8x16) -> i8x16 { - unsafe { - simd_shuffle16(a, b, [ - 8, 24, 9, 25, 10, 26, 11, 27, - 12, 28, 13, 29, 14, 30, 15, 31, - ]) - } +#[cfg_attr(test, assert_instr(punpckhbw))] +pub unsafe fn _mm_unpackhi_epi8(a: i8x16, b: i8x16) -> i8x16 { + simd_shuffle16(a, b, [ + 8, 24, 9, 25, 10, 26, 11, 27, + 12, 28, 13, 29, 14, 30, 15, 31, + ]) } /// Unpack and interleave 16-bit integers from the high half of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_unpackhi_epi16(a: i16x8, b: i16x8) -> i16x8 { - unsafe { simd_shuffle8(a, b, [4, 12, 5, 13, 6, 14, 7, 15]) } +#[cfg_attr(test, assert_instr(punpckhwd))] +pub unsafe fn _mm_unpackhi_epi16(a: i16x8, b: i16x8) -> i16x8 { + simd_shuffle8(a, b, [4, 12, 5, 13, 6, 14, 7, 15]) } /// Unpack and interleave 32-bit integers from the high half of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_unpackhi_epi32(a: i32x4, b: i32x4) -> i32x4 { - unsafe { simd_shuffle4(a, b, [2, 6, 3, 7]) } +#[cfg_attr(test, assert_instr(punpckhdq))] +pub unsafe fn _mm_unpackhi_epi32(a: i32x4, b: i32x4) -> i32x4 { + simd_shuffle4(a, b, [2, 6, 3, 7]) } /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_unpackhi_epi64(a: i64x2, b: i64x2) -> i64x2 { - unsafe { simd_shuffle2(a, b, [1, 3]) } +#[cfg_attr(test, assert_instr(punpckhqdq))] +pub unsafe fn _mm_unpackhi_epi64(a: i64x2, b: i64x2) -> i64x2 { + simd_shuffle2(a, b, [1, 3]) } /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_unpacklo_epi8(a: i8x16, b: i8x16) -> i8x16 { - unsafe { - simd_shuffle16(a, b, [ - 0, 16, 1, 17, 2, 18, 3, 19, - 4, 20, 5, 21, 6, 22, 7, 23, - ]) - } +#[cfg_attr(test, assert_instr(punpcklbw))] +pub unsafe fn _mm_unpacklo_epi8(a: i8x16, b: i8x16) -> i8x16 { + simd_shuffle16(a, b, [ + 0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23, + ]) } /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_unpacklo_epi16(a: i16x8, b: i16x8) -> i16x8 { - unsafe { simd_shuffle8(a, b, [0, 8, 1, 9, 2, 10, 3, 11]) } +#[cfg_attr(test, assert_instr(punpcklwd))] +pub unsafe fn _mm_unpacklo_epi16(a: i16x8, b: i16x8) -> i16x8 { + simd_shuffle8(a, b, [0, 8, 1, 9, 2, 10, 3, 11]) } /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_unpacklo_epi32(a: i32x4, b: i32x4) -> i32x4 { - unsafe { simd_shuffle4(a, b, [0, 4, 1, 5]) } +#[cfg_attr(test, assert_instr(punpckldq))] +pub unsafe fn _mm_unpacklo_epi32(a: i32x4, b: i32x4) -> i32x4 { + simd_shuffle4(a, b, [0, 4, 1, 5]) } /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_unpacklo_epi64(a: i64x2, b: i64x2) -> i64x2 { - unsafe { simd_shuffle2(a, b, [0, 2]) } +#[cfg_attr(test, assert_instr(punpcklqdq))] +pub unsafe fn _mm_unpacklo_epi64(a: i64x2, b: i64x2) -> i64x2 { + simd_shuffle2(a, b, [0, 2]) } /// Return a new vector with the low element of `a` replaced by the sum of the /// low elements of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_add_sd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(addsd))] +pub unsafe fn _mm_add_sd(a: f64x2, b: f64x2) -> f64x2 { a.replace(0, a.extract(0) + b.extract(0)) } @@ -1147,7 +1306,8 @@ pub fn _mm_add_sd(a: f64x2, b: f64x2) -> f64x2 { /// `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_add_pd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(addpd))] +pub unsafe fn _mm_add_pd(a: f64x2, b: f64x2) -> f64x2 { a + b } @@ -1155,7 +1315,8 @@ pub fn _mm_add_pd(a: f64x2, b: f64x2) -> f64x2 { /// diving the lower element of `a` by the lower element of `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_div_sd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(divsd))] +pub unsafe fn _mm_div_sd(a: f64x2, b: f64x2) -> f64x2 { a.replace(0, a.extract(0) / b.extract(0)) } @@ -1163,7 +1324,8 @@ pub fn _mm_div_sd(a: f64x2, b: f64x2) -> f64x2 { /// packed elements in `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_div_pd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(divpd))] +pub unsafe fn _mm_div_pd(a: f64x2, b: f64x2) -> f64x2 { a / b } @@ -1171,39 +1333,44 @@ pub fn _mm_div_pd(a: f64x2, b: f64x2) -> f64x2 { /// of the lower elements of `a` and `b`. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_max_sd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { maxsd(a, b) } +#[cfg_attr(test, assert_instr(maxsd))] +pub unsafe fn _mm_max_sd(a: f64x2, b: f64x2) -> f64x2 { + maxsd(a, b) } /// Return a new vector with the maximum values from corresponding elements in /// `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_max_pd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { maxpd(a, b) } +#[cfg_attr(test, assert_instr(maxpd))] +pub unsafe fn _mm_max_pd(a: f64x2, b: f64x2) -> f64x2 { + maxpd(a, b) } /// Return a new vector with the low element of `a` replaced by the minimum /// of the lower elements of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_min_sd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { minsd(a, b) } +#[cfg_attr(test, assert_instr(minsd))] +pub unsafe fn _mm_min_sd(a: f64x2, b: f64x2) -> f64x2 { + minsd(a, b) } /// Return a new vector with the minimum values from corresponding elements in /// `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_min_pd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { minpd(a, b) } +#[cfg_attr(test, assert_instr(minpd))] +pub unsafe fn _mm_min_pd(a: f64x2, b: f64x2) -> f64x2 { + minpd(a, b) } /// Return a new vector with the low element of `a` replaced by multiplying the /// low elements of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_mul_sd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(mulsd))] +pub unsafe fn _mm_mul_sd(a: f64x2, b: f64x2) -> f64x2 { a.replace(0, a.extract(0) * b.extract(0)) } @@ -1211,7 +1378,8 @@ pub fn _mm_mul_sd(a: f64x2, b: f64x2) -> f64x2 { /// and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_mul_pd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(mulpd))] +pub unsafe fn _mm_mul_pd(a: f64x2, b: f64x2) -> f64x2 { a * b } @@ -1219,22 +1387,25 @@ pub fn _mm_mul_pd(a: f64x2, b: f64x2) -> f64x2 { /// root of the lower element `b`. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_sqrt_sd(a: f64x2, b: f64x2) -> f64x2 { - a.replace(0, unsafe { sqrtsd(b).extract(0) }) +#[cfg_attr(test, assert_instr(sqrtsd))] +pub unsafe fn _mm_sqrt_sd(a: f64x2, b: f64x2) -> f64x2 { + a.replace(0, sqrtsd(b).extract(0)) } /// Return a new vector with the square root of each of the values in `a`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_sqrt_pd(a: f64x2) -> f64x2 { - unsafe { sqrtpd(a) } +#[cfg_attr(test, assert_instr(sqrtpd))] +pub unsafe fn _mm_sqrt_pd(a: f64x2) -> f64x2 { + sqrtpd(a) } /// Return a new vector with the low element of `a` replaced by subtracting the /// low element by `b` from the low element of `a`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_sub_sd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(subsd))] +pub unsafe fn _mm_sub_sd(a: f64x2, b: f64x2) -> f64x2 { a.replace(0, a.extract(0) - b.extract(0)) } @@ -1242,7 +1413,8 @@ pub fn _mm_sub_sd(a: f64x2, b: f64x2) -> f64x2 { /// from `a`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_sub_pd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(subpd))] +pub unsafe fn _mm_sub_pd(a: f64x2, b: f64x2) -> f64x2 { a - b } @@ -1250,76 +1422,76 @@ pub fn _mm_sub_pd(a: f64x2, b: f64x2) -> f64x2 { /// elements in `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_and_pd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let a: i64x2 = mem::transmute(a); - let b: i64x2 = mem::transmute(b); - mem::transmute(a & b) - } +#[cfg_attr(test, assert_instr(andps))] +pub unsafe fn _mm_and_pd(a: f64x2, b: f64x2) -> f64x2 { + let a: i64x2 = mem::transmute(a); + let b: i64x2 = mem::transmute(b); + mem::transmute(a & b) } /// Compute the bitwise NOT of `a` and then AND with `b`. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_andnot_pd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let a: i64x2 = mem::transmute(a); - let b: i64x2 = mem::transmute(b); - mem::transmute((!a) & b) - } +#[cfg_attr(test, assert_instr(andnps))] +pub unsafe fn _mm_andnot_pd(a: f64x2, b: f64x2) -> f64x2 { + let a: i64x2 = mem::transmute(a); + let b: i64x2 = mem::transmute(b); + mem::transmute((!a) & b) } /// Compute the bitwise OR of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_or_pd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let a: i64x2 = mem::transmute(a); - let b: i64x2 = mem::transmute(b); - mem::transmute(a | b) - } +#[cfg_attr(test, assert_instr(orps))] +pub unsafe fn _mm_or_pd(a: f64x2, b: f64x2) -> f64x2 { + let a: i64x2 = mem::transmute(a); + let b: i64x2 = mem::transmute(b); + mem::transmute(a | b) } /// Compute the bitwise XOR of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_xor_pd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let a: i64x2 = mem::transmute(a); - let b: i64x2 = mem::transmute(b); - mem::transmute(a ^ b) - } +#[cfg_attr(test, assert_instr(xorps))] +pub unsafe fn _mm_xor_pd(a: f64x2, b: f64x2) -> f64x2 { + let a: i64x2 = mem::transmute(a); + let b: i64x2 = mem::transmute(b); + mem::transmute(a ^ b) } /// Return a new vector with the low element of `a` replaced by the equality /// comparison of the lower elements of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpeq_sd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmpsd(a, b, 0) } +#[cfg_attr(test, assert_instr(cmpeqsd))] +pub unsafe fn _mm_cmpeq_sd(a: f64x2, b: f64x2) -> f64x2 { + cmpsd(a, b, 0) } /// Return a new vector with the low element of `a` replaced by the less-than /// comparison of the lower elements of `a` and `b`. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmplt_sd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmpsd(a, b, 1) } +#[cfg_attr(test, assert_instr(cmpltsd))] +pub unsafe fn _mm_cmplt_sd(a: f64x2, b: f64x2) -> f64x2 { + cmpsd(a, b, 1) } /// Return a new vector with the low element of `a` replaced by the /// less-than-or-equal comparison of the lower elements of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmple_sd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmpsd(a, b, 2) } +#[cfg_attr(test, assert_instr(cmplesd))] +pub unsafe fn _mm_cmple_sd(a: f64x2, b: f64x2) -> f64x2 { + cmpsd(a, b, 2) } /// Return a new vector with the low element of `a` replaced by the /// greater-than comparison of the lower elements of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(cmpltsd))] +pub unsafe fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 { _mm_cmplt_sd(b, a).replace(1, a.extract(1)) } @@ -1327,7 +1499,8 @@ pub fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 { /// greater-than-or-equal comparison of the lower elements of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpge_sd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(cmplesd))] +pub unsafe fn _mm_cmpge_sd(a: f64x2, b: f64x2) -> f64x2 { _mm_cmple_sd(b, a).replace(1, a.extract(1)) } @@ -1337,8 +1510,9 @@ pub fn _mm_cmpge_sd(a: f64x2, b: f64x2) -> f64x2 { /// otherwise. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpord_sd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmpsd(a, b, 7) } +#[cfg_attr(test, assert_instr(cmpordsd))] +pub unsafe fn _mm_cmpord_sd(a: f64x2, b: f64x2) -> f64x2 { + cmpsd(a, b, 7) } /// Return a new vector with the low element of `a` replaced by the result of @@ -1346,39 +1520,44 @@ pub fn _mm_cmpord_sd(a: f64x2, b: f64x2) -> f64x2 { /// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpunord_sd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmpsd(a, b, 3) } +#[cfg_attr(test, assert_instr(cmpunordsd))] +pub unsafe fn _mm_cmpunord_sd(a: f64x2, b: f64x2) -> f64x2 { + cmpsd(a, b, 3) } /// Return a new vector with the low element of `a` replaced by the not-equal /// comparison of the lower elements of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpneq_sd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmpsd(a, b, 4) } +#[cfg_attr(test, assert_instr(cmpneqsd))] +pub unsafe fn _mm_cmpneq_sd(a: f64x2, b: f64x2) -> f64x2 { + cmpsd(a, b, 4) } /// Return a new vector with the low element of `a` replaced by the /// not-less-than comparison of the lower elements of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpnlt_sd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmpsd(a, b, 5) } +#[cfg_attr(test, assert_instr(cmpnltsd))] +pub unsafe fn _mm_cmpnlt_sd(a: f64x2, b: f64x2) -> f64x2 { + cmpsd(a, b, 5) } /// Return a new vector with the low element of `a` replaced by the /// not-less-than-or-equal comparison of the lower elements of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpnle_sd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmpsd(a, b, 6) } +#[cfg_attr(test, assert_instr(cmpnlesd))] +pub unsafe fn _mm_cmpnle_sd(a: f64x2, b: f64x2) -> f64x2 { + cmpsd(a, b, 6) } /// Return a new vector with the low element of `a` replaced by the /// not-greater-than comparison of the lower elements of `a` and `b`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(cmpnltsd))] +pub unsafe fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 { _mm_cmpnlt_sd(b, a).replace(1, a.extract(1)) } @@ -1386,84 +1565,96 @@ pub fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 { /// not-greater-than-or-equal comparison of the lower elements of `a` and `b`. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpnge_sd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(cmpnlesd))] +pub unsafe fn _mm_cmpnge_sd(a: f64x2, b: f64x2) -> f64x2 { _mm_cmpnle_sd(b, a).replace(1, a.extract(1)) } /// Compare corresponding elements in `a` and `b` for equality. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpeq_pd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmppd(a, b, 0) } +#[cfg_attr(test, assert_instr(cmpeqpd))] +pub unsafe fn _mm_cmpeq_pd(a: f64x2, b: f64x2) -> f64x2 { + cmppd(a, b, 0) } /// Compare corresponding elements in `a` and `b` for less-than. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmplt_pd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmppd(a, b, 1) } +#[cfg_attr(test, assert_instr(cmpltpd))] +pub unsafe fn _mm_cmplt_pd(a: f64x2, b: f64x2) -> f64x2 { + cmppd(a, b, 1) } /// Compare corresponding elements in `a` and `b` for less-than-or-equal #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmple_pd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmppd(a, b, 2) } +#[cfg_attr(test, assert_instr(cmplepd))] +pub unsafe fn _mm_cmple_pd(a: f64x2, b: f64x2) -> f64x2 { + cmppd(a, b, 2) } /// Compare corresponding elements in `a` and `b` for greater-than. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpgt_pd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(cmpltpd))] +pub unsafe fn _mm_cmpgt_pd(a: f64x2, b: f64x2) -> f64x2 { _mm_cmplt_pd(b, a) } /// Compare corresponding elements in `a` and `b` for greater-than-or-equal. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpge_pd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(cmplepd))] +pub unsafe fn _mm_cmpge_pd(a: f64x2, b: f64x2) -> f64x2 { _mm_cmple_pd(b, a) } /// Compare corresponding elements in `a` and `b` to see if neither is `NaN`. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpord_pd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmppd(a, b, 7) } +#[cfg_attr(test, assert_instr(cmpordpd))] +pub unsafe fn _mm_cmpord_pd(a: f64x2, b: f64x2) -> f64x2 { + cmppd(a, b, 7) } /// Compare corresponding elements in `a` and `b` to see if either is `NaN`. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpunord_pd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmppd(a, b, 3) } +#[cfg_attr(test, assert_instr(cmpunordpd))] +pub unsafe fn _mm_cmpunord_pd(a: f64x2, b: f64x2) -> f64x2 { + cmppd(a, b, 3) } /// Compare corresponding elements in `a` and `b` for not-equal. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpneq_pd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmppd(a, b, 4) } +#[cfg_attr(test, assert_instr(cmpneqpd))] +pub unsafe fn _mm_cmpneq_pd(a: f64x2, b: f64x2) -> f64x2 { + cmppd(a, b, 4) } /// Compare corresponding elements in `a` and `b` for not-less-than. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpnlt_pd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmppd(a, b, 5) } +#[cfg_attr(test, assert_instr(cmpnltpd))] +pub unsafe fn _mm_cmpnlt_pd(a: f64x2, b: f64x2) -> f64x2 { + cmppd(a, b, 5) } /// Compare corresponding elements in `a` and `b` for not-less-than-or-equal. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpnle_pd(a: f64x2, b: f64x2) -> f64x2 { - unsafe { cmppd(a, b, 6) } +#[cfg_attr(test, assert_instr(cmpnlepd))] +pub unsafe fn _mm_cmpnle_pd(a: f64x2, b: f64x2) -> f64x2 { + cmppd(a, b, 6) } /// Compare corresponding elements in `a` and `b` for not-greater-than. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpngt_pd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(cmpnltpd))] +pub unsafe fn _mm_cmpngt_pd(a: f64x2, b: f64x2) -> f64x2 { _mm_cmpnlt_pd(b, a) } @@ -1471,92 +1662,105 @@ pub fn _mm_cmpngt_pd(a: f64x2, b: f64x2) -> f64x2 { /// not-greater-than-or-equal. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_cmpnge_pd(a: f64x2, b: f64x2) -> f64x2 { +#[cfg_attr(test, assert_instr(cmpnlepd))] +pub unsafe fn _mm_cmpnge_pd(a: f64x2, b: f64x2) -> f64x2 { _mm_cmpnle_pd(b, a) } /// Compare the lower element of `a` and `b` for equality. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_comieq_sd(a: f64x2, b: f64x2) -> bool { - unsafe { mem::transmute(comieqsd(a, b) as u8) } +#[cfg_attr(test, assert_instr(comisd))] +pub unsafe fn _mm_comieq_sd(a: f64x2, b: f64x2) -> bool { + mem::transmute(comieqsd(a, b) as u8) } /// Compare the lower element of `a` and `b` for less-than. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_comilt_sd(a: f64x2, b: f64x2) -> bool { - unsafe { mem::transmute(comiltsd(a, b) as u8) } +#[cfg_attr(test, assert_instr(comisd))] +pub unsafe fn _mm_comilt_sd(a: f64x2, b: f64x2) -> bool { + mem::transmute(comiltsd(a, b) as u8) } /// Compare the lower element of `a` and `b` for less-than-or-equal. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_comile_sd(a: f64x2, b: f64x2) -> bool { - unsafe { mem::transmute(comilesd(a, b) as u8) } +#[cfg_attr(test, assert_instr(comisd))] +pub unsafe fn _mm_comile_sd(a: f64x2, b: f64x2) -> bool { + mem::transmute(comilesd(a, b) as u8) } /// Compare the lower element of `a` and `b` for greater-than. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_comigt_sd(a: f64x2, b: f64x2) -> bool { - unsafe { mem::transmute(comigtsd(a, b) as u8) } +#[cfg_attr(test, assert_instr(comisd))] +pub unsafe fn _mm_comigt_sd(a: f64x2, b: f64x2) -> bool { + mem::transmute(comigtsd(a, b) as u8) } /// Compare the lower element of `a` and `b` for greater-than-or-equal. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_comige_sd(a: f64x2, b: f64x2) -> bool { - unsafe { mem::transmute(comigesd(a, b) as u8) } +#[cfg_attr(test, assert_instr(comisd))] +pub unsafe fn _mm_comige_sd(a: f64x2, b: f64x2) -> bool { + mem::transmute(comigesd(a, b) as u8) } /// Compare the lower element of `a` and `b` for not-equal. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_comineq_sd(a: f64x2, b: f64x2) -> bool { - unsafe { mem::transmute(comineqsd(a, b) as u8) } +#[cfg_attr(test, assert_instr(comisd))] +pub unsafe fn _mm_comineq_sd(a: f64x2, b: f64x2) -> bool { + mem::transmute(comineqsd(a, b) as u8) } /// Compare the lower element of `a` and `b` for equality. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_ucomieq_sd(a: f64x2, b: f64x2) -> bool { - unsafe { mem::transmute(ucomieqsd(a, b) as u8) } +#[cfg_attr(test, assert_instr(ucomisd))] +pub unsafe fn _mm_ucomieq_sd(a: f64x2, b: f64x2) -> bool { + mem::transmute(ucomieqsd(a, b) as u8) } /// Compare the lower element of `a` and `b` for less-than. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_ucomilt_sd(a: f64x2, b: f64x2) -> bool { - unsafe { mem::transmute(ucomiltsd(a, b) as u8) } +#[cfg_attr(test, assert_instr(ucomisd))] +pub unsafe fn _mm_ucomilt_sd(a: f64x2, b: f64x2) -> bool { + mem::transmute(ucomiltsd(a, b) as u8) } /// Compare the lower element of `a` and `b` for less-than-or-equal. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_ucomile_sd(a: f64x2, b: f64x2) -> bool { - unsafe { mem::transmute(ucomilesd(a, b) as u8) } +#[cfg_attr(test, assert_instr(ucomisd))] +pub unsafe fn _mm_ucomile_sd(a: f64x2, b: f64x2) -> bool { + mem::transmute(ucomilesd(a, b) as u8) } /// Compare the lower element of `a` and `b` for greater-than. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_ucomigt_sd(a: f64x2, b: f64x2) -> bool { - unsafe { mem::transmute(ucomigtsd(a, b) as u8) } +#[cfg_attr(test, assert_instr(ucomisd))] +pub unsafe fn _mm_ucomigt_sd(a: f64x2, b: f64x2) -> bool { + mem::transmute(ucomigtsd(a, b) as u8) } /// Compare the lower element of `a` and `b` for greater-than-or-equal. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_ucomige_sd(a: f64x2, b: f64x2) -> bool { - unsafe { mem::transmute(ucomigesd(a, b) as u8) } +#[cfg_attr(test, assert_instr(ucomisd))] +pub unsafe fn _mm_ucomige_sd(a: f64x2, b: f64x2) -> bool { + mem::transmute(ucomigesd(a, b) as u8) } /// Compare the lower element of `a` and `b` for not-equal. #[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool { - unsafe { mem::transmute(ucomineqsd(a, b) as u8) } +#[cfg_attr(test, assert_instr(ucomisd))] +pub unsafe fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool { + mem::transmute(ucomineqsd(a, b) as u8) } /// Return a mask of the most significant bit of each element in `a`. @@ -1565,8 +1769,9 @@ pub fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool { /// All other bits are set to `0`. 
#[inline(always)] #[target_feature = "+sse2"] -pub fn _mm_movemask_pd(a: f64x2) -> i32 { - unsafe { movmskpd(a) } +#[cfg_attr(test, assert_instr(movmskpd))] +pub unsafe fn _mm_movemask_pd(a: f64x2) -> i32 { + movmskpd(a) } @@ -1574,12 +1779,14 @@ pub fn _mm_movemask_pd(a: f64x2) -> i32 { #[inline(always)] #[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> f64x2 { *(mem_addr as *const f64x2) } #[inline(always)] #[target_feature = "+sse2"] +#[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: f64x2) { *(mem_addr as *mut f64x2) = a; } @@ -1730,7 +1937,7 @@ mod tests { #[simd_test = "sse2"] fn _mm_pause() { - sse2::_mm_pause(); + unsafe { sse2::_mm_pause() }; } #[simd_test = "sse2"] @@ -1741,12 +1948,12 @@ mod tests { #[simd_test = "sse2"] fn _mm_lfence() { - sse2::_mm_lfence(); + unsafe { sse2::_mm_lfence() }; } #[simd_test = "sse2"] fn _mm_mfence() { - sse2::_mm_mfence(); + unsafe { sse2::_mm_mfence() }; } #[simd_test = "sse2"] @@ -1755,7 +1962,7 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = i8x16::new( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = sse2::_mm_add_epi8(a, b); + let r = unsafe { sse2::_mm_add_epi8(a, b) }; let e = i8x16::new( 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46); assert_eq!(r, e); @@ -1765,7 +1972,7 @@ mod tests { fn _mm_add_epi8_overflow() { let a = i8x16::splat(0x7F); let b = i8x16::splat(1); - let r = sse2::_mm_add_epi8(a, b); + let r = unsafe { sse2::_mm_add_epi8(a, b) }; assert_eq!(r, i8x16::splat(-128)); } @@ -1773,7 +1980,7 @@ mod tests { fn _mm_add_epi16() { let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15); - let r = sse2::_mm_add_epi16(a, b); + let r = unsafe { sse2::_mm_add_epi16(a, b) }; let e = i16x8::new(8, 10, 12, 14, 16, 18, 20, 22); assert_eq!(r, e); } @@ -1782,7 +1989,7 @@ mod tests { fn _mm_add_epi32() 
{ let a = i32x4::new(0, 1, 2, 3); let b = i32x4::new(4, 5, 6, 7); - let r = sse2::_mm_add_epi32(a, b); + let r = unsafe { sse2::_mm_add_epi32(a, b) }; let e = i32x4::new(4, 6, 8, 10); assert_eq!(r, e); } @@ -1791,7 +1998,7 @@ mod tests { fn _mm_add_epi64() { let a = i64x2::new(0, 1); let b = i64x2::new(2, 3); - let r = sse2::_mm_add_epi64(a, b); + let r = unsafe { sse2::_mm_add_epi64(a, b) }; let e = i64x2::new(2, 4); assert_eq!(r, e); } @@ -1802,7 +2009,7 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = i8x16::new( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = sse2::_mm_adds_epi8(a, b); + let r = unsafe { sse2::_mm_adds_epi8(a, b) }; let e = i8x16::new( 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46); assert_eq!(r, e); @@ -1812,7 +2019,7 @@ mod tests { fn _mm_adds_epi8_saturate_positive() { let a = i8x16::splat(0x7F); let b = i8x16::splat(1); - let r = sse2::_mm_adds_epi8(a, b); + let r = unsafe { sse2::_mm_adds_epi8(a, b) }; assert_eq!(r, a); } @@ -1820,7 +2027,7 @@ mod tests { fn _mm_adds_epi8_saturate_negative() { let a = i8x16::splat(-0x80); let b = i8x16::splat(-1); - let r = sse2::_mm_adds_epi8(a, b); + let r = unsafe { sse2::_mm_adds_epi8(a, b) }; assert_eq!(r, a); } @@ -1828,7 +2035,7 @@ mod tests { fn _mm_adds_epi16() { let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15); - let r = sse2::_mm_adds_epi16(a, b); + let r = unsafe { sse2::_mm_adds_epi16(a, b) }; let e = i16x8::new(8, 10, 12, 14, 16, 18, 20, 22); assert_eq!(r, e); } @@ -1837,7 +2044,7 @@ mod tests { fn _mm_adds_epi16_saturate_positive() { let a = i16x8::splat(0x7FFF); let b = i16x8::splat(1); - let r = sse2::_mm_adds_epi16(a, b); + let r = unsafe { sse2::_mm_adds_epi16(a, b) }; assert_eq!(r, a); } @@ -1845,7 +2052,7 @@ mod tests { fn _mm_adds_epi16_saturate_negative() { let a = i16x8::splat(-0x8000); let b = i16x8::splat(-1); - let r = sse2::_mm_adds_epi16(a, b); + let r = unsafe { 
sse2::_mm_adds_epi16(a, b) }; assert_eq!(r, a); } @@ -1855,7 +2062,7 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = u8x16::new( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - let r = sse2::_mm_adds_epu8(a, b); + let r = unsafe { sse2::_mm_adds_epu8(a, b) }; let e = u8x16::new( 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46); assert_eq!(r, e); @@ -1865,7 +2072,7 @@ mod tests { fn _mm_adds_epu8_saturate() { let a = u8x16::splat(0xFF); let b = u8x16::splat(1); - let r = sse2::_mm_adds_epu8(a, b); + let r = unsafe { sse2::_mm_adds_epu8(a, b) }; assert_eq!(r, a); } @@ -1873,7 +2080,7 @@ mod tests { fn _mm_adds_epu16() { let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); let b = u16x8::new(8, 9, 10, 11, 12, 13, 14, 15); - let r = sse2::_mm_adds_epu16(a, b); + let r = unsafe { sse2::_mm_adds_epu16(a, b) }; let e = u16x8::new(8, 10, 12, 14, 16, 18, 20, 22); assert_eq!(r, e); } @@ -1882,21 +2089,21 @@ mod tests { fn _mm_adds_epu16_saturate() { let a = u16x8::splat(0xFFFF); let b = u16x8::splat(1); - let r = sse2::_mm_adds_epu16(a, b); + let r = unsafe { sse2::_mm_adds_epu16(a, b) }; assert_eq!(r, a); } #[simd_test = "sse2"] fn _mm_avg_epu8() { let (a, b) = (u8x16::splat(3), u8x16::splat(9)); - let r = sse2::_mm_avg_epu8(a, b); + let r = unsafe { sse2::_mm_avg_epu8(a, b) }; assert_eq!(r, u8x16::splat(6)); } #[simd_test = "sse2"] fn _mm_avg_epu16() { let (a, b) = (u16x8::splat(3), u16x8::splat(9)); - let r = sse2::_mm_avg_epu16(a, b); + let r = unsafe { sse2::_mm_avg_epu16(a, b) }; assert_eq!(r, u16x8::splat(6)); } @@ -1904,7 +2111,7 @@ mod tests { fn _mm_madd_epi16() { let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let b = i16x8::new(9, 10, 11, 12, 13, 14, 15, 16); - let r = sse2::_mm_madd_epi16(a, b); + let r = unsafe { sse2::_mm_madd_epi16(a, b) }; let e = i32x4::new(29, 81, 149, 233); assert_eq!(r, e); } @@ -1913,7 +2120,7 @@ mod tests { fn _mm_max_epi16() { let a = i16x8::splat(1); let b = i16x8::splat(-1); - let r 
= sse2::_mm_max_epi16(a, b); + let r = unsafe { sse2::_mm_max_epi16(a, b) }; assert_eq!(r, a); } @@ -1921,7 +2128,7 @@ mod tests { fn _mm_max_epu8() { let a = u8x16::splat(1); let b = u8x16::splat(255); - let r = sse2::_mm_max_epu8(a, b); + let r = unsafe { sse2::_mm_max_epu8(a, b) }; assert_eq!(r, b); } @@ -1929,7 +2136,7 @@ mod tests { fn _mm_min_epi16() { let a = i16x8::splat(1); let b = i16x8::splat(-1); - let r = sse2::_mm_min_epi16(a, b); + let r = unsafe { sse2::_mm_min_epi16(a, b) }; assert_eq!(r, b); } @@ -1937,28 +2144,28 @@ mod tests { fn _mm_min_epu8() { let a = u8x16::splat(1); let b = u8x16::splat(255); - let r = sse2::_mm_min_epu8(a, b); + let r = unsafe { sse2::_mm_min_epu8(a, b) }; assert_eq!(r, a); } #[simd_test = "sse2"] fn _mm_mulhi_epi16() { let (a, b) = (i16x8::splat(1000), i16x8::splat(-1001)); - let r = sse2::_mm_mulhi_epi16(a, b); + let r = unsafe { sse2::_mm_mulhi_epi16(a, b) }; assert_eq!(r, i16x8::splat(-16)); } #[simd_test = "sse2"] fn _mm_mulhi_epu16() { let (a, b) = (u16x8::splat(1000), u16x8::splat(1001)); - let r = sse2::_mm_mulhi_epu16(a, b); + let r = unsafe { sse2::_mm_mulhi_epu16(a, b) }; assert_eq!(r, u16x8::splat(15)); } #[simd_test = "sse2"] fn _mm_mullo_epi16() { let (a, b) = (i16x8::splat(1000), i16x8::splat(-1001)); - let r = sse2::_mm_mullo_epi16(a, b); + let r = unsafe { sse2::_mm_mullo_epi16(a, b) }; assert_eq!(r, i16x8::splat(-17960)); } @@ -1966,7 +2173,7 @@ mod tests { fn _mm_mul_epu32() { let a = u32x4::from(u64x2::new(1_000_000_000, 1 << 34)); let b = u32x4::from(u64x2::new(1_000_000_000, 1 << 35)); - let r = sse2::_mm_mul_epu32(a, b); + let r = unsafe { sse2::_mm_mul_epu32(a, b) }; let e = u64x2::new(1_000_000_000 * 1_000_000_000, 0); assert_eq!(r, e); } @@ -1979,7 +2186,7 @@ mod tests { let b = u8x16::new( 0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2); - let r = sse2::_mm_sad_epu8(a, b); + let r = unsafe { sse2::_mm_sad_epu8(a, b) }; let e = u64x2::new(1020, 614); assert_eq!(r, e); } @@ -1987,35 +2194,35 @@ mod 
tests { #[simd_test = "sse2"] fn _mm_sub_epi8() { let (a, b) = (i8x16::splat(5), i8x16::splat(6)); - let r = sse2::_mm_sub_epi8(a, b); + let r = unsafe { sse2::_mm_sub_epi8(a, b) }; assert_eq!(r, i8x16::splat(-1)); } #[simd_test = "sse2"] fn _mm_sub_epi16() { let (a, b) = (i16x8::splat(5), i16x8::splat(6)); - let r = sse2::_mm_sub_epi16(a, b); + let r = unsafe { sse2::_mm_sub_epi16(a, b) }; assert_eq!(r, i16x8::splat(-1)); } #[simd_test = "sse2"] fn _mm_sub_epi32() { let (a, b) = (i32x4::splat(5), i32x4::splat(6)); - let r = sse2::_mm_sub_epi32(a, b); + let r = unsafe { sse2::_mm_sub_epi32(a, b) }; assert_eq!(r, i32x4::splat(-1)); } #[simd_test = "sse2"] fn _mm_sub_epi64() { let (a, b) = (i64x2::splat(5), i64x2::splat(6)); - let r = sse2::_mm_sub_epi64(a, b); + let r = unsafe { sse2::_mm_sub_epi64(a, b) }; assert_eq!(r, i64x2::splat(-1)); } #[simd_test = "sse2"] fn _mm_subs_epi8() { let (a, b) = (i8x16::splat(5), i8x16::splat(2)); - let r = sse2::_mm_subs_epi8(a, b); + let r = unsafe { sse2::_mm_subs_epi8(a, b) }; assert_eq!(r, i8x16::splat(3)); } @@ -2023,7 +2230,7 @@ mod tests { fn _mm_subs_epi8_saturate_positive() { let a = i8x16::splat(0x7F); let b = i8x16::splat(-1); - let r = sse2::_mm_subs_epi8(a, b); + let r = unsafe { sse2::_mm_subs_epi8(a, b) }; assert_eq!(r, a); } @@ -2031,14 +2238,14 @@ mod tests { fn _mm_subs_epi8_saturate_negative() { let a = i8x16::splat(-0x80); let b = i8x16::splat(1); - let r = sse2::_mm_subs_epi8(a, b); + let r = unsafe { sse2::_mm_subs_epi8(a, b) }; assert_eq!(r, a); } #[simd_test = "sse2"] fn _mm_subs_epi16() { let (a, b) = (i16x8::splat(5), i16x8::splat(2)); - let r = sse2::_mm_subs_epi16(a, b); + let r = unsafe { sse2::_mm_subs_epi16(a, b) }; assert_eq!(r, i16x8::splat(3)); } @@ -2046,7 +2253,7 @@ mod tests { fn _mm_subs_epi16_saturate_positive() { let a = i16x8::splat(0x7FFF); let b = i16x8::splat(-1); - let r = sse2::_mm_subs_epi16(a, b); + let r = unsafe { sse2::_mm_subs_epi16(a, b) }; assert_eq!(r, a); } @@ -2054,14 
+2261,14 @@ mod tests { fn _mm_subs_epi16_saturate_negative() { let a = i16x8::splat(-0x8000); let b = i16x8::splat(1); - let r = sse2::_mm_subs_epi16(a, b); + let r = unsafe { sse2::_mm_subs_epi16(a, b) }; assert_eq!(r, a); } #[simd_test = "sse2"] fn _mm_subs_epu8() { let (a, b) = (u8x16::splat(5), u8x16::splat(2)); - let r = sse2::_mm_subs_epu8(a, b); + let r = unsafe { sse2::_mm_subs_epu8(a, b) }; assert_eq!(r, u8x16::splat(3)); } @@ -2069,14 +2276,14 @@ mod tests { fn _mm_subs_epu8_saturate() { let a = u8x16::splat(0); let b = u8x16::splat(1); - let r = sse2::_mm_subs_epu8(a, b); + let r = unsafe { sse2::_mm_subs_epu8(a, b) }; assert_eq!(r, a); } #[simd_test = "sse2"] fn _mm_subs_epu16() { let (a, b) = (u16x8::splat(5), u16x8::splat(2)); - let r = sse2::_mm_subs_epu16(a, b); + let r = unsafe { sse2::_mm_subs_epu16(a, b) }; assert_eq!(r, u16x8::splat(3)); } @@ -2084,7 +2291,7 @@ mod tests { fn _mm_subs_epu16_saturate() { let a = u16x8::splat(0); let b = u16x8::splat(1); - let r = sse2::_mm_subs_epu16(a, b); + let r = unsafe { sse2::_mm_subs_epu16(a, b) }; assert_eq!(r, a); } @@ -2092,31 +2299,31 @@ mod tests { fn _mm_slli_si128() { let a = __m128i::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = sse2::_mm_slli_si128(a, 1); + let r = unsafe { sse2::_mm_slli_si128(a, 1) }; let e = __m128i::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq!(r, e); let a = __m128i::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = sse2::_mm_slli_si128(a, 15); + let r = unsafe { sse2::_mm_slli_si128(a, 15) }; let e = __m128i::new( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); assert_eq!(r, e); let a = __m128i::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = sse2::_mm_slli_si128(a, 16); + let r = unsafe { sse2::_mm_slli_si128(a, 16) }; assert_eq!(r, __m128i::splat(0)); let a = __m128i::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = sse2::_mm_slli_si128(a, -1); + let r = 
unsafe { sse2::_mm_slli_si128(a, -1) }; assert_eq!(r, __m128i::splat(0)); let a = __m128i::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = sse2::_mm_slli_si128(a, -0x80000000); + let r = unsafe { sse2::_mm_slli_si128(a, -0x80000000) }; assert_eq!(r, __m128i::splat(0)); } @@ -2124,7 +2331,7 @@ mod tests { fn _mm_slli_epi16() { let a = i16x8::new( 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0); - let r = sse2::_mm_slli_epi16(a, 4); + let r = unsafe { sse2::_mm_slli_epi16(a, 4) }; let e = i16x8::new( 0xFFF0 as u16 as i16, 0xFFF0 as u16 as i16, 0x0FF0, 0x00F0, 0, 0, 0, 0); @@ -2134,98 +2341,101 @@ mod tests { #[simd_test = "sse2"] fn _mm_sll_epi16() { let a = i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0); - let r = sse2::_mm_sll_epi16(a, i16x8::new(4, 0, 0, 0, 0, 0, 0, 0)); + let r = unsafe { + sse2::_mm_sll_epi16(a, i16x8::new(4, 0, 0, 0, 0, 0, 0, 0)) + }; assert_eq!(r, i16x8::new(0xFF0, 0, 0, 0, 0, 0, 0, 0)); - let r = sse2::_mm_sll_epi16(a, i16x8::new(0, 0, 0, 0, 4, 0, 0, 0)); + let r = unsafe { + sse2::_mm_sll_epi16(a, i16x8::new(0, 0, 0, 0, 4, 0, 0, 0)) + }; assert_eq!(r, i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0)); } #[simd_test = "sse2"] fn _mm_slli_epi32() { - assert_eq!( - sse2::_mm_slli_epi32(i32x4::splat(0xFFFF), 4), - i32x4::splat(0xFFFF0)); + let r = unsafe { sse2::_mm_slli_epi32(i32x4::splat(0xFFFF), 4) }; + assert_eq!(r, i32x4::splat(0xFFFF0)); } #[simd_test = "sse2"] fn _mm_sll_epi32() { - assert_eq!( - sse2::_mm_sll_epi32(i32x4::splat(0xFFFF), i32x4::new(4, 0, 0, 0)), - i32x4::splat(0xFFFF0)); + let a = i32x4::splat(0xFFFF); + let b = i32x4::new(4, 0, 0, 0); + let r = unsafe { sse2::_mm_sll_epi32(a, b) }; + assert_eq!(r, i32x4::splat(0xFFFF0)); } #[simd_test = "sse2"] fn _mm_slli_epi64() { - assert_eq!( - sse2::_mm_slli_epi64(i64x2::splat(0xFFFFFFFF), 4), - i64x2::splat(0xFFFFFFFF0)); + let r = unsafe { sse2::_mm_slli_epi64(i64x2::splat(0xFFFFFFFF), 4) }; + assert_eq!(r, i64x2::splat(0xFFFFFFFF0)); } #[simd_test = "sse2"] fn 
_mm_sll_epi64() { - assert_eq!( - sse2::_mm_sll_epi64( - i64x2::splat(0xFFFFFFFF), i64x2::new(4, 0)), - i64x2::splat(0xFFFFFFFF0)); + let a = i64x2::splat(0xFFFFFFFF); + let b = i64x2::new(4, 0); + let r = unsafe { sse2::_mm_sll_epi64(a, b) }; + assert_eq!(r, i64x2::splat(0xFFFFFFFF0)); } #[simd_test = "sse2"] fn _mm_srai_epi16() { - assert_eq!( - sse2::_mm_srai_epi16(i16x8::splat(-1), 1), i16x8::splat(-1)); + let r = unsafe { sse2::_mm_srai_epi16(i16x8::splat(-1), 1) }; + assert_eq!(r, i16x8::splat(-1)); } #[simd_test = "sse2"] fn _mm_sra_epi16() { - assert_eq!( - sse2::_mm_sra_epi16( - i16x8::splat(-1), i16x8::new(1, 0, 0, 0, 0, 0, 0, 0)), - i16x8::splat(-1)); + let a = i16x8::splat(-1); + let b = i16x8::new(1, 0, 0, 0, 0, 0, 0, 0); + let r = unsafe { sse2::_mm_sra_epi16(a, b) }; + assert_eq!(r, i16x8::splat(-1)); } #[simd_test = "sse2"] fn _mm_srai_epi32() { - assert_eq!( - sse2::_mm_srai_epi32(i32x4::splat(-1), 1), i32x4::splat(-1)); + let r = unsafe { sse2::_mm_srai_epi32(i32x4::splat(-1), 1) }; + assert_eq!(r, i32x4::splat(-1)); } #[simd_test = "sse2"] fn _mm_sra_epi32() { - assert_eq!( - sse2::_mm_sra_epi32( - i32x4::splat(-1), i32x4::new(1, 0, 0, 0)), - i32x4::splat(-1)); + let a = i32x4::splat(-1); + let b = i32x4::new(1, 0, 0, 0); + let r = unsafe { sse2::_mm_sra_epi32(a, b) }; + assert_eq!(r, i32x4::splat(-1)); } #[simd_test = "sse2"] fn _mm_srli_si128() { let a = __m128i::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = sse2::_mm_srli_si128(a, 1); + let r = unsafe { sse2::_mm_srli_si128(a, 1) }; let e = __m128i::new( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0); assert_eq!(r, e); let a = __m128i::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = sse2::_mm_srli_si128(a, 15); + let r = unsafe { sse2::_mm_srli_si128(a, 15) }; let e = __m128i::new( 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq!(r, e); let a = __m128i::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = 
sse2::_mm_srli_si128(a, 16); + let r = unsafe { sse2::_mm_srli_si128(a, 16) }; assert_eq!(r, __m128i::splat(0)); let a = __m128i::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = sse2::_mm_srli_si128(a, -1); + let r = unsafe { sse2::_mm_srli_si128(a, -1) }; assert_eq!(r, __m128i::splat(0)); let a = __m128i::new( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let r = sse2::_mm_srli_si128(a, -0x80000000); + let r = unsafe { sse2::_mm_srli_si128(a, -0x80000000) }; assert_eq!(r, __m128i::splat(0)); } @@ -2233,7 +2443,7 @@ mod tests { fn _mm_srli_epi16() { let a = i16x8::new( 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0); - let r = sse2::_mm_srli_epi16(a, 4); + let r = unsafe { sse2::_mm_srli_epi16(a, 4) }; let e = i16x8::new( 0xFFF as u16 as i16, 0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0); @@ -2243,67 +2453,74 @@ mod tests { #[simd_test = "sse2"] fn _mm_srl_epi16() { let a = i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0); - let r = sse2::_mm_srl_epi16(a, i16x8::new(4, 0, 0, 0, 0, 0, 0, 0)); + let r = unsafe { + sse2::_mm_srl_epi16(a, i16x8::new(4, 0, 0, 0, 0, 0, 0, 0)) + }; assert_eq!(r, i16x8::new(0xF, 0, 0, 0, 0, 0, 0, 0)); - let r = sse2::_mm_srl_epi16(a, i16x8::new(0, 0, 0, 0, 4, 0, 0, 0)); + let r = unsafe { + sse2::_mm_srl_epi16(a, i16x8::new(0, 0, 0, 0, 4, 0, 0, 0)) + }; assert_eq!(r, i16x8::new(0xFF, 0, 0, 0, 0, 0, 0, 0)); } #[simd_test = "sse2"] fn _mm_srli_epi32() { - assert_eq!( - sse2::_mm_srli_epi32(i32x4::splat(0xFFFF), 4), - i32x4::splat(0xFFF)); + let r = unsafe { sse2::_mm_srli_epi32(i32x4::splat(0xFFFF), 4) }; + assert_eq!(r, i32x4::splat(0xFFF)); } #[simd_test = "sse2"] fn _mm_srl_epi32() { - assert_eq!( - sse2::_mm_srl_epi32(i32x4::splat(0xFFFF), i32x4::new(4, 0, 0, 0)), - i32x4::splat(0xFFF)); + let a = i32x4::splat(0xFFFF); + let b = i32x4::new(4, 0, 0, 0); + let r = unsafe { sse2::_mm_srl_epi32(a, b) }; + assert_eq!(r, i32x4::splat(0xFFF)); } #[simd_test = "sse2"] fn _mm_srli_epi64() { - assert_eq!( - 
sse2::_mm_srli_epi64(i64x2::splat(0xFFFFFFFF), 4), - i64x2::splat(0xFFFFFFF)); + let r = unsafe { sse2::_mm_srli_epi64(i64x2::splat(0xFFFFFFFF), 4) }; + assert_eq!(r, i64x2::splat(0xFFFFFFF)); } #[simd_test = "sse2"] fn _mm_srl_epi64() { - assert_eq!( - sse2::_mm_srl_epi64( - i64x2::splat(0xFFFFFFFF), i64x2::new(4, 0)), - i64x2::splat(0xFFFFFFF)); + let a = i64x2::splat(0xFFFFFFFF); + let b = i64x2::new(4, 0); + let r = unsafe { sse2::_mm_srl_epi64(a, b) }; + assert_eq!(r, i64x2::splat(0xFFFFFFF)); } #[simd_test = "sse2"] fn _mm_and_si128() { - assert_eq!( - sse2::_mm_and_si128(__m128i::splat(5), __m128i::splat(3)), - __m128i::splat(1)); + let a = __m128i::splat(5); + let b = __m128i::splat(3); + let r = unsafe { sse2::_mm_and_si128(a, b) }; + assert_eq!(r, __m128i::splat(1)); } #[simd_test = "sse2"] fn _mm_andnot_si128() { - assert_eq!( - sse2::_mm_andnot_si128(__m128i::splat(5), __m128i::splat(3)), - __m128i::splat(2)); + let a = __m128i::splat(5); + let b = __m128i::splat(3); + let r = unsafe { sse2::_mm_andnot_si128(a, b) }; + assert_eq!(r, __m128i::splat(2)); } #[simd_test = "sse2"] fn _mm_or_si128() { - assert_eq!( - sse2::_mm_or_si128(__m128i::splat(5), __m128i::splat(3)), - __m128i::splat(7)); + let a = __m128i::splat(5); + let b = __m128i::splat(3); + let r = unsafe { sse2::_mm_or_si128(a, b) }; + assert_eq!(r, __m128i::splat(7)); } #[simd_test = "sse2"] fn _mm_xor_si128() { - assert_eq!( - sse2::_mm_xor_si128(__m128i::splat(5), __m128i::splat(3)), - __m128i::splat(6)); + let a = __m128i::splat(5); + let b = __m128i::splat(3); + let r = unsafe { sse2::_mm_xor_si128(a, b) }; + assert_eq!(r, __m128i::splat(6)); } #[simd_test = "sse2"] @@ -2312,7 +2529,7 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = i8x16::new( 15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = sse2::_mm_cmpeq_epi8(a, b); + let r = unsafe { sse2::_mm_cmpeq_epi8(a, b) }; assert_eq!(r, i8x16::new( 0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0)); } @@ -2321,7 +2538,7 @@ mod tests { fn _mm_cmpeq_epi16() { let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); let b = i16x8::new(7, 6, 2, 4, 3, 2, 1, 0); - let r = sse2::_mm_cmpeq_epi16(a, b); + let r = unsafe { sse2::_mm_cmpeq_epi16(a, b) }; assert_eq!(r, i16x8::splat(0).replace(2, 0xFFFFu16 as i16)); } @@ -2329,7 +2546,7 @@ mod tests { fn _mm_cmpeq_epi32() { let a = i32x4::new(0, 1, 2, 3); let b = i32x4::new(3, 2, 2, 0); - let r = sse2::_mm_cmpeq_epi32(a, b); + let r = unsafe { sse2::_mm_cmpeq_epi32(a, b) }; assert_eq!(r, i32x4::splat(0).replace(2, 0xFFFFFFFFu32 as i32)); } @@ -2337,7 +2554,7 @@ mod tests { fn _mm_cmpgt_epi8() { let a = i8x16::splat(0).replace(0, 5); let b = i8x16::splat(0); - let r = sse2::_mm_cmpgt_epi8(a, b); + let r = unsafe { sse2::_mm_cmpgt_epi8(a, b) }; assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8)); } @@ -2345,7 +2562,7 @@ mod tests { fn _mm_cmpgt_epi16() { let a = i16x8::splat(0).replace(0, 5); let b = i16x8::splat(0); - let r = sse2::_mm_cmpgt_epi16(a, b); + let r = unsafe { sse2::_mm_cmpgt_epi16(a, b) }; assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16)); } @@ -2353,7 +2570,7 @@ mod tests { fn _mm_cmpgt_epi32() { let a = i32x4::splat(0).replace(0, 5); let b = i32x4::splat(0); - let r = sse2::_mm_cmpgt_epi32(a, b); + let r = unsafe { sse2::_mm_cmpgt_epi32(a, b) }; assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32)); } @@ -2361,7 +2578,7 @@ mod tests { fn _mm_cmplt_epi8() { let a = i8x16::splat(0); let b = i8x16::splat(0).replace(0, 5); - let r = sse2::_mm_cmplt_epi8(a, b); + let r = unsafe { sse2::_mm_cmplt_epi8(a, b) }; assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8)); } @@ -2369,7 +2586,7 @@ mod tests { fn _mm_cmplt_epi16() { let a = i16x8::splat(0); let b = i16x8::splat(0).replace(0, 5); - let r = sse2::_mm_cmplt_epi16(a, b); + let r = unsafe { sse2::_mm_cmplt_epi16(a, b) }; assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16)); } @@ -2377,123 +2594,161 @@ mod tests { fn _mm_cmplt_epi32() { 
let a = i32x4::splat(0); let b = i32x4::splat(0).replace(0, 5); - let r = sse2::_mm_cmplt_epi32(a, b); + let r = unsafe { sse2::_mm_cmplt_epi32(a, b) }; assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32)); } #[simd_test = "sse2"] fn _mm_cvtepi32_pd() { - let a = sse2::_mm_set_epi32(35, 25, 15, 5); - let r = sse2::_mm_cvtepi32_pd(a); + let a = unsafe { sse2::_mm_set_epi32(35, 25, 15, 5) }; + let r = unsafe { sse2::_mm_cvtepi32_pd(a) }; assert_eq!(r, f64x2::new(5.0, 15.0)); } #[simd_test = "sse2"] fn _mm_cvtsi32_sd() { let a = f64x2::splat(3.5); - assert_eq!(sse2::_mm_cvtsi32_sd(a, 5), f64x2::new(5.0, 3.5)); + let r = unsafe { sse2::_mm_cvtsi32_sd(a, 5) }; + assert_eq!(r, f64x2::new(5.0, 3.5)); } + #[cfg(target_arch = "x86_64")] #[simd_test = "sse2"] fn _mm_cvtsi64_sd() { let a = f64x2::splat(3.5); - assert_eq!(sse2::_mm_cvtsi64_sd(a, 5), f64x2::new(5.0, 3.5)); + let r = unsafe { sse2::_mm_cvtsi64_sd(a, 5) }; + assert_eq!(r, f64x2::new(5.0, 3.5)); } #[simd_test = "sse2"] fn _mm_cvtepi32_ps() { let a = i32x4::new(1, 2, 3, 4); - assert_eq!(sse2::_mm_cvtepi32_ps(a), f32x4::new(1.0, 2.0, 3.0, 4.0)); + let r = unsafe { sse2::_mm_cvtepi32_ps(a) }; + assert_eq!(r, f32x4::new(1.0, 2.0, 3.0, 4.0)); } #[simd_test = "sse2"] fn _mm_cvtsi32_si128() { - assert_eq!(sse2::_mm_cvtsi32_si128(5), i32x4::new(5, 0, 0, 0)); + let r = unsafe { sse2::_mm_cvtsi32_si128(5) }; + assert_eq!(r, i32x4::new(5, 0, 0, 0)); } + #[cfg(target_arch = "x86_64")] #[simd_test = "sse2"] fn _mm_cvtsi64_si128() { - assert_eq!(sse2::_mm_cvtsi64_si128(5), i64x2::new(5, 0)); + let r = unsafe { sse2::_mm_cvtsi64_si128(5) }; + assert_eq!(r, i64x2::new(5, 0)); } #[simd_test = "sse2"] fn _mm_cvtsi128_si32() { - assert_eq!(sse2::_mm_cvtsi128_si32(i32x4::new(5, 0, 0, 0)), 5); + let r = unsafe { sse2::_mm_cvtsi128_si32(i32x4::new(5, 0, 0, 0)) }; + assert_eq!(r, 5); } + #[cfg(target_arch = "x86_64")] #[simd_test = "sse2"] fn _mm_cvtsi128_si64() { - assert_eq!(sse2::_mm_cvtsi128_si64(i64x2::new(5, 0)), 5); + 
let r = unsafe { sse2::_mm_cvtsi128_si64(i64x2::new(5, 0)) }; + assert_eq!(r, 5); } #[simd_test = "sse2"] fn _mm_set_epi64x() { - assert_eq!(sse2::_mm_set_epi64x(0, 1), i64x2::new(1, 0)); + let r = unsafe { sse2::_mm_set_epi64x(0, 1) }; + assert_eq!(r, i64x2::new(1, 0)); } #[simd_test = "sse2"] fn _mm_set_epi32() { - assert_eq!(sse2::_mm_set_epi32(0, 1, 2, 3), i32x4::new(3, 2, 1, 0)); + let r = unsafe { sse2::_mm_set_epi32(0, 1, 2, 3) }; + assert_eq!(r, i32x4::new(3, 2, 1, 0)); } #[simd_test = "sse2"] fn _mm_set_epi16() { - assert_eq!( - sse2::_mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7), - i16x8::new(7, 6, 5, 4, 3, 2, 1, 0)); + let r = unsafe { sse2::_mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7) }; + assert_eq!(r, i16x8::new(7, 6, 5, 4, 3, 2, 1, 0)); } #[simd_test = "sse2"] fn _mm_set_epi8() { - assert_eq!( + let r = unsafe { sse2::_mm_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), - i8x16::new(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + ) + }; + let e = i8x16::new( + 15, 14, 13, 12, + 11, 10, 9, 8, + 7, 6, 5, 4, + 3, 2, 1, 0, + ); + assert_eq!(r, e); } #[simd_test = "sse2"] fn _mm_set1_epi64x() { - assert_eq!(sse2::_mm_set1_epi64x(1), i64x2::splat(1)); + let r = unsafe { sse2::_mm_set1_epi64x(1) }; + assert_eq!(r, i64x2::splat(1)); } #[simd_test = "sse2"] fn _mm_set1_epi32() { - assert_eq!(sse2::_mm_set1_epi32(1), i32x4::splat(1)); + let r = unsafe { sse2::_mm_set1_epi32(1) }; + assert_eq!(r, i32x4::splat(1)); } #[simd_test = "sse2"] fn _mm_set1_epi16() { - assert_eq!(sse2::_mm_set1_epi16(1), i16x8::splat(1)); + let r = unsafe { sse2::_mm_set1_epi16(1) }; + assert_eq!(r, i16x8::splat(1)); } #[simd_test = "sse2"] fn _mm_set1_epi8() { - assert_eq!(sse2::_mm_set1_epi8(1), i8x16::splat(1)); + let r = unsafe { sse2::_mm_set1_epi8(1) }; + assert_eq!(r, i8x16::splat(1)); } #[simd_test = "sse2"] fn _mm_setr_epi32() { - assert_eq!(sse2::_mm_setr_epi32(0, 1, 2, 3), i32x4::new(0, 1, 2, 3)); + 
let r = unsafe { sse2::_mm_setr_epi32(0, 1, 2, 3) }; + assert_eq!(r, i32x4::new(0, 1, 2, 3)); } #[simd_test = "sse2"] fn _mm_setr_epi16() { - assert_eq!( - sse2::_mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), - i16x8::new(0, 1, 2, 3, 4, 5, 6, 7)); + let r = unsafe { sse2::_mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7) }; + assert_eq!(r, i16x8::new(0, 1, 2, 3, 4, 5, 6, 7)); } #[simd_test = "sse2"] fn _mm_setr_epi8() { - assert_eq!( + let r = unsafe { sse2::_mm_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), - i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + ) + }; + let e = i8x16::new( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + ); + assert_eq!(r, e); } #[simd_test = "sse2"] fn _mm_setzero_si128() { - assert_eq!(sse2::_mm_setzero_si128(), __m128i::from(i64x2::splat(0))); + let r = unsafe { sse2::_mm_setzero_si128() }; + assert_eq!(r, __m128i::from(i64x2::splat(0))); } #[simd_test = "sse2"] @@ -2505,14 +2760,14 @@ mod tests { #[simd_test = "sse2"] fn _mm_load_si128() { - let a = sse2::_mm_set_epi64x(5, 6); + let a = unsafe { sse2::_mm_set_epi64x(5, 6) }; let r = unsafe { sse2::_mm_load_si128(&a as *const _ as *const _) }; assert_eq!(a, i64x2::from(r)); } #[simd_test = "sse2"] fn _mm_loadu_si128() { - let a = sse2::_mm_set_epi64x(5, 6); + let a = unsafe { sse2::_mm_set_epi64x(5, 6) }; let r = unsafe { sse2::_mm_loadu_si128(&a as *const _ as *const _) }; assert_eq!(a, i64x2::from(r)); } @@ -2561,14 +2816,15 @@ mod tests { #[simd_test = "sse2"] fn _mm_move_epi64() { let a = i64x2::new(5, 6); - assert_eq!(sse2::_mm_move_epi64(a), i64x2::new(5, 0)); + let r = unsafe { sse2::_mm_move_epi64(a) }; + assert_eq!(r, i64x2::new(5, 0)); } #[simd_test = "sse2"] fn _mm_packs_epi16() { let a = i16x8::new(0x80, -0x81, 0, 0, 0, 0, 0, 0); let b = i16x8::new(0, 0, 0, 0, 0, 0, -0x81, 0x80); - let r = sse2::_mm_packs_epi16(a, b); + let r = unsafe { sse2::_mm_packs_epi16(a, b) }; 
assert_eq!(r, i8x16::new( 0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F)); @@ -2578,7 +2834,7 @@ mod tests { fn _mm_packs_epi32() { let a = i32x4::new(0x8000, -0x8001, 0, 0); let b = i32x4::new(0, 0, -0x8001, 0x8000); - let r = sse2::_mm_packs_epi32(a, b); + let r = unsafe { sse2::_mm_packs_epi32(a, b) }; assert_eq!( r, i16x8::new(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF)); } @@ -2587,7 +2843,7 @@ mod tests { fn _mm_packus_epi16() { let a = i16x8::new(0x100, -1, 0, 0, 0, 0, 0, 0); let b = i16x8::new(0, 0, 0, 0, 0, 0, -1, 0x100); - let r = sse2::_mm_packus_epi16(a, b); + let r = unsafe { sse2::_mm_packus_epi16(a, b) }; assert_eq!(r, u8x16::new( 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF)); @@ -2596,13 +2852,15 @@ mod tests { #[simd_test = "sse2"] fn _mm_extract_epi16() { let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq!(sse2::_mm_extract_epi16(a, 5), 5); + let r = unsafe { sse2::_mm_extract_epi16(a, 5) }; + assert_eq!(r, 5); } #[simd_test = "sse2"] fn _mm_insert_epi16() { let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq!(sse2::_mm_insert_epi16(a, 9, 0), a.replace(0, 9)); + let r = unsafe { sse2::_mm_insert_epi16(a, 9, 0) }; + assert_eq!(r, a.replace(0, 9)); } #[simd_test = "sse2"] @@ -2610,28 +2868,32 @@ mod tests { let a = i8x16::from(u8x16::new( 0b1000_0000, 0b0, 0b1000_0000, 0b01, 0b0101, 0b1111_0000, 0, 0, 0, 0, 0b1111_0000, 0b0101, 0b01, 0b1000_0000, 0b0, 0b1000_0000)); - assert_eq!(sse2::_mm_movemask_epi8(a), 0b10100100_00100101); + let r = unsafe { sse2::_mm_movemask_epi8(a) }; + assert_eq!(r, 0b10100100_00100101); } #[simd_test = "sse2"] fn _mm_shuffle_epi32() { let a = i32x4::new(5, 10, 15, 20); + let r = unsafe { sse2::_mm_shuffle_epi32(a, 0b00_01_01_11) }; let e = i32x4::new(20, 10, 10, 5); - assert_eq!(sse2::_mm_shuffle_epi32(a, 0b00_01_01_11), e); + assert_eq!(r, e); } #[simd_test = "sse2"] fn _mm_shufflehi_epi16() { let a = i16x8::new(1, 2, 3, 4, 5, 10, 15, 20); + let r = unsafe { sse2::_mm_shufflehi_epi16(a, 
0b00_01_01_11) }; let e = i16x8::new(1, 2, 3, 4, 20, 10, 10, 5); - assert_eq!(sse2::_mm_shufflehi_epi16(a, 0b00_01_01_11), e); + assert_eq!(r, e); } #[simd_test = "sse2"] fn _mm_shufflelo_epi16() { let a = i16x8::new(5, 10, 15, 20, 1, 2, 3, 4); + let r = unsafe { sse2::_mm_shufflelo_epi16(a, 0b00_01_01_11) }; let e = i16x8::new(20, 10, 10, 5, 1, 2, 3, 4); - assert_eq!(sse2::_mm_shufflelo_epi16(a, 0b00_01_01_11), e); + assert_eq!(r, e); } #[simd_test = "sse2"] @@ -2640,33 +2902,37 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = i8x16::new( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = unsafe { sse2::_mm_unpackhi_epi8(a, b) }; let e = i8x16::new( 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); - assert_eq!(sse2::_mm_unpackhi_epi8(a, b), e); + assert_eq!(r, e); } #[simd_test = "sse2"] fn _mm_unpackhi_epi16() { let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15); + let r = unsafe { sse2::_mm_unpackhi_epi16(a, b) }; let e = i16x8::new(4, 12, 5, 13, 6, 14, 7, 15); - assert_eq!(sse2::_mm_unpackhi_epi16(a, b), e); + assert_eq!(r, e); } #[simd_test = "sse2"] fn _mm_unpackhi_epi32() { let a = i32x4::new(0, 1, 2, 3); let b = i32x4::new(4, 5, 6, 7); + let r = unsafe { sse2::_mm_unpackhi_epi32(a, b) }; let e = i32x4::new(2, 6, 3, 7); - assert_eq!(sse2::_mm_unpackhi_epi32(a, b), e); + assert_eq!(r, e); } #[simd_test = "sse2"] fn _mm_unpackhi_epi64() { let a = i64x2::new(0, 1); let b = i64x2::new(2, 3); + let r = unsafe { sse2::_mm_unpackhi_epi64(a, b) }; let e = i64x2::new(1, 3); - assert_eq!(sse2::_mm_unpackhi_epi64(a, b), e); + assert_eq!(r, e); } #[simd_test = "sse2"] @@ -2675,131 +2941,147 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = i8x16::new( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = unsafe { sse2::_mm_unpacklo_epi8(a, b) }; let e = i8x16::new( 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 
7, 23); - assert_eq!(sse2::_mm_unpacklo_epi8(a, b), e); + assert_eq!(r, e); } #[simd_test = "sse2"] fn _mm_unpacklo_epi16() { let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15); + let r = unsafe { sse2::_mm_unpacklo_epi16(a, b) }; let e = i16x8::new(0, 8, 1, 9, 2, 10, 3, 11); - assert_eq!(sse2::_mm_unpacklo_epi16(a, b), e); + assert_eq!(r, e); } #[simd_test = "sse2"] fn _mm_unpacklo_epi32() { let a = i32x4::new(0, 1, 2, 3); let b = i32x4::new(4, 5, 6, 7); + let r = unsafe { sse2::_mm_unpacklo_epi32(a, b) }; let e = i32x4::new(0, 4, 1, 5); - assert_eq!(sse2::_mm_unpacklo_epi32(a, b), e); + assert_eq!(r, e); } #[simd_test = "sse2"] fn _mm_unpacklo_epi64() { let a = i64x2::new(0, 1); let b = i64x2::new(2, 3); + let r = unsafe { sse2::_mm_unpacklo_epi64(a, b) }; let e = i64x2::new(0, 2); - assert_eq!(sse2::_mm_unpacklo_epi64(a, b), e); + assert_eq!(r, e); } #[simd_test = "sse2"] fn _mm_add_sd() { - assert_eq!( - sse2::_mm_add_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)), - f64x2::new(6.0, 2.0)); + let a = f64x2::new(1.0, 2.0); + let b = f64x2::new(5.0, 10.0); + let r = unsafe { sse2::_mm_add_sd(a, b) }; + assert_eq!(r, f64x2::new(6.0, 2.0)); } #[simd_test = "sse2"] fn _mm_add_pd() { - assert_eq!( - sse2::_mm_add_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)), - f64x2::new(6.0, 12.0)); + let a = f64x2::new(1.0, 2.0); + let b = f64x2::new(5.0, 10.0); + let r = unsafe { sse2::_mm_add_pd(a, b) }; + assert_eq!(r, f64x2::new(6.0, 12.0)); } #[simd_test = "sse2"] fn _mm_div_sd() { - assert_eq!( - sse2::_mm_div_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)), - f64x2::new(0.2, 2.0)); + let a = f64x2::new(1.0, 2.0); + let b = f64x2::new(5.0, 10.0); + let r = unsafe { sse2::_mm_div_sd(a, b) }; + assert_eq!(r, f64x2::new(0.2, 2.0)); } #[simd_test = "sse2"] fn _mm_div_pd() { - assert_eq!( - sse2::_mm_div_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)), - f64x2::new(0.2, 0.2)); + let a = f64x2::new(1.0, 2.0); + let b = f64x2::new(5.0, 
10.0); + let r = unsafe { sse2::_mm_div_pd(a, b) }; + assert_eq!(r, f64x2::new(0.2, 0.2)); } #[simd_test = "sse2"] fn _mm_max_sd() { - assert_eq!( - sse2::_mm_max_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)), - f64x2::new(5.0, 2.0)); + let a = f64x2::new(1.0, 2.0); + let b = f64x2::new(5.0, 10.0); + let r = unsafe { sse2::_mm_max_sd(a, b) }; + assert_eq!(r, f64x2::new(5.0, 2.0)); } #[simd_test = "sse2"] fn _mm_max_pd() { - assert_eq!( - sse2::_mm_max_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)), - f64x2::new(5.0, 10.0)); + let a = f64x2::new(1.0, 2.0); + let b = f64x2::new(5.0, 10.0); + let r = unsafe { sse2::_mm_max_pd(a, b) }; + assert_eq!(r, f64x2::new(5.0, 10.0)); } #[simd_test = "sse2"] fn _mm_min_sd() { - assert_eq!( - sse2::_mm_min_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)), - f64x2::new(1.0, 2.0)); + let a = f64x2::new(1.0, 2.0); + let b = f64x2::new(5.0, 10.0); + let r = unsafe { sse2::_mm_min_sd(a, b) }; + assert_eq!(r, f64x2::new(1.0, 2.0)); } #[simd_test = "sse2"] fn _mm_min_pd() { - assert_eq!( - sse2::_mm_min_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)), - f64x2::new(1.0, 2.0)); + let a = f64x2::new(1.0, 2.0); + let b = f64x2::new(5.0, 10.0); + let r = unsafe { sse2::_mm_min_pd(a, b) }; + assert_eq!(r, f64x2::new(1.0, 2.0)); } #[simd_test = "sse2"] fn _mm_mul_sd() { - assert_eq!( - sse2::_mm_mul_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)), - f64x2::new(5.0, 2.0)); + let a = f64x2::new(1.0, 2.0); + let b = f64x2::new(5.0, 10.0); + let r = unsafe { sse2::_mm_mul_sd(a, b) }; + assert_eq!(r, f64x2::new(5.0, 2.0)); } #[simd_test = "sse2"] fn _mm_mul_pd() { - assert_eq!( - sse2::_mm_mul_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)), - f64x2::new(5.0, 20.0)); + let a = f64x2::new(1.0, 2.0); + let b = f64x2::new(5.0, 10.0); + let r = unsafe { sse2::_mm_mul_pd(a, b) }; + assert_eq!(r, f64x2::new(5.0, 20.0)); } #[simd_test = "sse2"] fn _mm_sqrt_sd() { - assert_eq!( - sse2::_mm_sqrt_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)), - 
f64x2::new(5.0f64.sqrt(), 2.0)); + let a = f64x2::new(1.0, 2.0); + let b = f64x2::new(5.0, 10.0); + let r = unsafe { sse2::_mm_sqrt_sd(a, b) }; + assert_eq!(r, f64x2::new(5.0f64.sqrt(), 2.0)); } #[simd_test = "sse2"] fn _mm_sqrt_pd() { - assert_eq!( - sse2::_mm_sqrt_pd(f64x2::new(1.0, 2.0)), - f64x2::new(1.0f64.sqrt(), 2.0f64.sqrt())); + let r = unsafe { sse2::_mm_sqrt_pd(f64x2::new(1.0, 2.0)) }; + assert_eq!(r, f64x2::new(1.0f64.sqrt(), 2.0f64.sqrt())); } #[simd_test = "sse2"] fn _mm_sub_sd() { - assert_eq!( - sse2::_mm_sub_sd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)), - f64x2::new(-4.0, 2.0)); + let a = f64x2::new(1.0, 2.0); + let b = f64x2::new(5.0, 10.0); + let r = unsafe { sse2::_mm_sub_sd(a, b) }; + assert_eq!(r, f64x2::new(-4.0, 2.0)); } #[simd_test = "sse2"] fn _mm_sub_pd() { - assert_eq!( - sse2::_mm_sub_pd(f64x2::new(1.0, 2.0), f64x2::new(5.0, 10.0)), - f64x2::new(-4.0, -8.0)); + let a = f64x2::new(1.0, 2.0); + let b = f64x2::new(5.0, 10.0); + let r = unsafe { sse2::_mm_sub_pd(a, b) }; + assert_eq!(r, f64x2::new(-4.0, -8.0)); } #[simd_test = "sse2"] @@ -2809,8 +3091,9 @@ mod tests { unsafe { let a: f64x2 = transmute(i64x2::splat(5)); let b: f64x2 = transmute(i64x2::splat(3)); + let r = sse2::_mm_and_pd(a, b); let e: f64x2 = transmute(i64x2::splat(1)); - assert_eq!(sse2::_mm_and_pd(a, b), e); + assert_eq!(r, e); } } @@ -2821,8 +3104,9 @@ mod tests { unsafe { let a: f64x2 = transmute(i64x2::splat(5)); let b: f64x2 = transmute(i64x2::splat(3)); + let r = sse2::_mm_andnot_pd(a, b); let e: f64x2 = transmute(i64x2::splat(2)); - assert_eq!(sse2::_mm_andnot_pd(a, b), e); + assert_eq!(r, e); } } @@ -2833,8 +3117,9 @@ mod tests { unsafe { let a: f64x2 = transmute(i64x2::splat(5)); let b: f64x2 = transmute(i64x2::splat(3)); + let r = sse2::_mm_or_pd(a, b); let e: f64x2 = transmute(i64x2::splat(7)); - assert_eq!(sse2::_mm_or_pd(a, b), e); + assert_eq!(r, e); } } @@ -2845,8 +3130,9 @@ mod tests { unsafe { let a: f64x2 = transmute(i64x2::splat(5)); let b: f64x2 = 
transmute(i64x2::splat(3)); + let r = sse2::_mm_xor_pd(a, b); let e: f64x2 = transmute(i64x2::splat(6)); - assert_eq!(sse2::_mm_xor_pd(a, b), e); + assert_eq!(r, e); } } @@ -3147,40 +3433,40 @@ mod tests { use std::f64::NAN; let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0)); - assert!(sse2::_mm_comieq_sd(a, b)); + assert!(unsafe { sse2::_mm_comieq_sd(a, b) }); let (a, b) = (f64x2::new(NAN, 2.0), f64x2::new(1.0, 3.0)); - assert!(!sse2::_mm_comieq_sd(a, b)); + assert!(unsafe { !sse2::_mm_comieq_sd(a, b) }); } #[simd_test = "sse2"] fn _mm_comilt_sd() { let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0)); - assert!(!sse2::_mm_comilt_sd(a, b)); + assert!(unsafe { !sse2::_mm_comilt_sd(a, b) }); } #[simd_test = "sse2"] fn _mm_comile_sd() { let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0)); - assert!(sse2::_mm_comile_sd(a, b)); + assert!(unsafe { sse2::_mm_comile_sd(a, b) }); } #[simd_test = "sse2"] fn _mm_comigt_sd() { let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0)); - assert!(!sse2::_mm_comigt_sd(a, b)); + assert!(unsafe { !sse2::_mm_comigt_sd(a, b) }); } #[simd_test = "sse2"] fn _mm_comige_sd() { let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0)); - assert!(sse2::_mm_comige_sd(a, b)); + assert!(unsafe { sse2::_mm_comige_sd(a, b) }); } #[simd_test = "sse2"] fn _mm_comineq_sd() { let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0)); - assert!(!sse2::_mm_comineq_sd(a, b)); + assert!(unsafe { !sse2::_mm_comineq_sd(a, b) }); } #[simd_test = "sse2"] @@ -3188,48 +3474,48 @@ mod tests { use std::f64::NAN; let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0)); - assert!(sse2::_mm_ucomieq_sd(a, b)); + assert!(unsafe { sse2::_mm_ucomieq_sd(a, b) }); let (a, b) = (f64x2::new(NAN, 2.0), f64x2::new(NAN, 3.0)); - assert!(!sse2::_mm_ucomieq_sd(a, b)); + assert!(unsafe { !sse2::_mm_ucomieq_sd(a, b) }); } #[simd_test = "sse2"] fn _mm_ucomilt_sd() { let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0)); - assert!(!sse2::_mm_ucomilt_sd(a, 
b)); + assert!(unsafe { !sse2::_mm_ucomilt_sd(a, b) }); } #[simd_test = "sse2"] fn _mm_ucomile_sd() { let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0)); - assert!(sse2::_mm_ucomile_sd(a, b)); + assert!(unsafe { sse2::_mm_ucomile_sd(a, b) }); } #[simd_test = "sse2"] fn _mm_ucomigt_sd() { let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0)); - assert!(!sse2::_mm_ucomigt_sd(a, b)); + assert!(unsafe { !sse2::_mm_ucomigt_sd(a, b) }); } #[simd_test = "sse2"] fn _mm_ucomige_sd() { let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0)); - assert!(sse2::_mm_ucomige_sd(a, b)); + assert!(unsafe { sse2::_mm_ucomige_sd(a, b) }); } #[simd_test = "sse2"] fn _mm_ucomineq_sd() { let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0)); - assert!(!sse2::_mm_ucomineq_sd(a, b)); + assert!(unsafe { !sse2::_mm_ucomineq_sd(a, b) }); } #[simd_test = "sse2"] fn _mm_movemask_pd() { - let r = sse2::_mm_movemask_pd(f64x2::new(-1.0, 5.0)); + let r = unsafe { sse2::_mm_movemask_pd(f64x2::new(-1.0, 5.0)) }; assert_eq!(r, 0b01); - let r = sse2::_mm_movemask_pd(f64x2::new(-1.0, -5.0)); + let r = unsafe { sse2::_mm_movemask_pd(f64x2::new(-1.0, -5.0)) }; assert_eq!(r, 0b11); } } diff --git a/library/stdarch/src/x86/sse41.rs b/library/stdarch/src/x86/sse41.rs index 0ca528a80b15..6b81b2d51b06 100644 --- a/library/stdarch/src/x86/sse41.rs +++ b/library/stdarch/src/x86/sse41.rs @@ -1,18 +1,18 @@ -use v128::*; -use x86::__m128i; - #[cfg(test)] use stdsimd_test::assert_instr; +use v128::*; +use x86::__m128i; + #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pblendvb))] -pub fn _mm_blendv_epi8( +pub unsafe fn _mm_blendv_epi8( a: __m128i, b: __m128i, mask: __m128i, ) -> __m128i { - unsafe { pblendvb(a, b, mask) } + pblendvb(a, b, mask) } /// Returns the dot product of two f64x2 vectors. @@ -24,15 +24,20 @@ pub fn _mm_blendv_epi8( /// the broadcast mask bit is zero then the return component will be zero. 
#[inline(always)] #[target_feature = "+sse4.1"] -pub fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { +pub unsafe fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { macro_rules! call { - ($imm8:expr) => { - unsafe { dppd(a, b, $imm8) } - } + ($imm8:expr) => { dppd(a, b, $imm8) } } constify_imm8!(imm8, call) } +#[cfg(test)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(dppd))] +fn _test_mm_dp_pd(a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_dp_pd(a, b, 0) } +} + /// Returns the dot product of two f32x4 vectors. /// /// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask. @@ -42,15 +47,20 @@ pub fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { /// the broadcast mask bit is zero then the return component will be zero. #[inline(always)] #[target_feature = "+sse4.1"] -pub fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { +pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { macro_rules! call { - ($imm8:expr) => { - unsafe { dpps(a, b, $imm8) } - } + ($imm8:expr) => { dpps(a, b, $imm8) } } constify_imm8!(imm8, call) } +#[cfg(test)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(dpps))] +fn _test_mm_dp_ps(a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_dp_ps(a, b, 0) } +} + #[allow(improper_ctypes)] extern { #[link_name = "llvm.x86.sse41.pblendvb"] @@ -78,7 +88,7 @@ mod tests { 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); let e = i8x16::new( 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31); - assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e); + assert_eq!(unsafe { sse41::_mm_blendv_epi8(a, b, mask) }, e); } #[simd_test = "sse4.1"] @@ -86,7 +96,7 @@ mod tests { let a = f64x2::new(2.0, 3.0); let b = f64x2::new(1.0, 4.0); let e = f64x2::new(14.0, 0.0); - assert_eq!(sse41::_mm_dp_pd(a, b, 0b00110001), e); + assert_eq!(unsafe { sse41::_mm_dp_pd(a, b, 0b00110001) }, e); } #[simd_test = "sse4.1"] @@ -94,6 +104,6 @@ mod tests { let a = f32x4::new(2.0, 3.0, 1.0, 10.0); let b = 
f32x4::new(1.0, 4.0, 0.5, 10.0); let e = f32x4::new(14.5, 0.0, 14.5, 0.0); - assert_eq!(sse41::_mm_dp_ps(a, b, 0b01110101), e); + assert_eq!(unsafe { sse41::_mm_dp_ps(a, b, 0b01110101) }, e); } } diff --git a/library/stdarch/src/x86/sse42.rs b/library/stdarch/src/x86/sse42.rs index 22c90ed503a8..70e024f79639 100644 --- a/library/stdarch/src/x86/sse42.rs +++ b/library/stdarch/src/x86/sse42.rs @@ -1,3 +1,6 @@ +#[cfg(test)] +use stdsimd_test::assert_instr; + use x86::__m128i; pub const _SIDD_UBYTE_OPS: i8 = 0b00000000; @@ -19,7 +22,7 @@ pub const _SIDD_MOST_SIGNIFICANT: i8 = 0b01000000; #[inline(always)] #[target_feature = "+sse4.2"] -pub fn _mm_cmpestri( +pub unsafe fn _mm_cmpestri( a: __m128i, la: i32, b: __m128i, @@ -27,13 +30,18 @@ pub fn _mm_cmpestri( imm8: i8, ) -> i32 { macro_rules! call { - ($imm8:expr) => { - unsafe { pcmpestri128(a, la, b, lb, $imm8) } - } + ($imm8:expr) => { pcmpestri128(a, la, b, lb, $imm8) } } constify_imm8!(imm8, call) } +#[cfg(test)] +#[target_feature = "+sse4.2"] +#[cfg_attr(test, assert_instr(pcmpestri))] +fn _test_mm_cmpestri(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 { + unsafe { _mm_cmpestri(a, la, b, lb, 0) } +} + #[allow(improper_ctypes)] extern { #[link_name = "llvm.x86.sse42.pcmpestri128"] @@ -53,8 +61,10 @@ mod tests { let b = &b"foobar "[..]; let va = __m128i::from(u8x16::load(a, 0)); let vb = __m128i::from(u8x16::load(b, 0)); - let i = sse42::_mm_cmpestri( - va, 3, vb, 6, sse42::_SIDD_CMP_EQUAL_ORDERED); + let i = unsafe { + sse42::_mm_cmpestri( + va, 3, vb, 6, sse42::_SIDD_CMP_EQUAL_ORDERED) + }; assert_eq!(3, i); } } diff --git a/library/stdarch/src/x86/ssse3.rs b/library/stdarch/src/x86/ssse3.rs index 2ad0a8efe913..1f6c7488a1a1 100644 --- a/library/stdarch/src/x86/ssse3.rs +++ b/library/stdarch/src/x86/ssse3.rs @@ -1,15 +1,15 @@ -use v128::*; - #[cfg(test)] use stdsimd_test::assert_instr; +use v128::*; + /// Compute the absolute value of packed 8-bit signed integers in `a` and /// return the unsigned results. 
#[inline(always)] #[target_feature = "+ssse3"] #[cfg_attr(test, assert_instr(pabsb))] -pub fn _mm_abs_epi8(a: i8x16) -> u8x16 { - unsafe { pabsb128(a) } +pub unsafe fn _mm_abs_epi8(a: i8x16) -> u8x16 { + pabsb128(a) } /// Shuffle bytes from `a` according to the content of `b`. @@ -39,8 +39,8 @@ pub fn _mm_abs_epi8(a: i8x16) -> u8x16 { #[inline(always)] #[target_feature = "+ssse3"] #[cfg_attr(test, assert_instr(pshufb))] -pub fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 { - unsafe { pshufb128(a, b) } +pub unsafe fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 { + pshufb128(a, b) } @@ -48,7 +48,6 @@ pub fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 { extern { #[link_name = "llvm.x86.ssse3.pabs.b.128"] fn pabsb128(a: i8x16) -> u8x16; - #[link_name = "llvm.x86.ssse3.pshuf.b.128"] fn pshufb128(a: u8x16, b: u8x16) -> u8x16; } @@ -62,16 +61,31 @@ mod tests { #[simd_test = "ssse3"] fn _mm_abs_epi8() { - let r = ssse3::_mm_abs_epi8(i8x16::splat(-5)); + let r = unsafe { ssse3::_mm_abs_epi8(i8x16::splat(-5)) }; assert_eq!(r, u8x16::splat(5)); } #[simd_test = "ssse3"] fn _mm_shuffle_epi8() { - let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = u8x16::new(4, 128, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0); - let expected = u8x16::new(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1); - let r = ssse3::_mm_shuffle_epi8(a, b); + let a = u8x16::new( + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + let b = u8x16::new( + 4, 128, 4, 3, + 24, 12, 6, 19, + 12, 5, 5, 10, + 4, 1, 8, 0, + ); + let expected = u8x16::new( + 5, 0, 5, 4, + 9, 13, 7, 4, + 13, 6, 6, 11, + 5, 2, 9, 1, + ); + let r = unsafe { ssse3::_mm_shuffle_epi8(a, b) }; assert_eq!(r, expected); } } diff --git a/library/stdarch/src/x86/tbm.rs b/library/stdarch/src/x86/tbm.rs index ad8590f115f9..8611eed58172 100644 --- a/library/stdarch/src/x86/tbm.rs +++ b/library/stdarch/src/x86/tbm.rs @@ -65,7 +65,7 @@ pub fn _bextr2_u64(a: u64, control: u64) -> u64 { 
#[inline(always)] #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(blcfill))] -pub fn _blcfill_u32(x: u32) -> u32 { +pub unsafe fn _blcfill_u32(x: u32) -> u32 { x & (x.wrapping_add(1)) } @@ -76,7 +76,7 @@ pub fn _blcfill_u32(x: u32) -> u32 { #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(blcfill))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions -pub fn _blcfill_u64(x: u64) -> u64 { +pub unsafe fn _blcfill_u64(x: u64) -> u64 { x & (x.wrapping_add(1)) } @@ -86,7 +86,7 @@ pub fn _blcfill_u64(x: u64) -> u64 { #[inline(always)] #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(blci))] -pub fn _blci_u32(x: u32) -> u32 { +pub unsafe fn _blci_u32(x: u32) -> u32 { x | !(x.wrapping_add(1)) } @@ -97,7 +97,7 @@ pub fn _blci_u32(x: u32) -> u32 { #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(blci))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions -pub fn _blci_u64(x: u64) -> u64 { +pub unsafe fn _blci_u64(x: u64) -> u64 { x | !(x.wrapping_add(1)) } @@ -107,7 +107,7 @@ pub fn _blci_u64(x: u64) -> u64 { #[inline(always)] #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(blcic))] -pub fn _blcic_u32(x: u32) -> u32 { +pub unsafe fn _blcic_u32(x: u32) -> u32 { !x & (x.wrapping_add(1)) } @@ -118,7 +118,7 @@ pub fn _blcic_u32(x: u32) -> u32 { #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(blcic))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions -pub fn _blcic_u64(x: u64) -> u64 { +pub unsafe fn _blcic_u64(x: u64) -> u64 { !x & (x.wrapping_add(1)) } @@ -128,7 +128,7 @@ pub fn _blcic_u64(x: u64) -> u64 { #[inline(always)] #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(blcmsk))] -pub fn _blcmsk_u32(x: u32) -> u32 { +pub unsafe fn _blcmsk_u32(x: u32) -> u32 { x ^ (x.wrapping_add(1)) } @@ -139,7 +139,7 @@ pub fn _blcmsk_u32(x: u32) -> u32 { #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(blcmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions 
-pub fn _blcmsk_u64(x: u64) -> u64 { +pub unsafe fn _blcmsk_u64(x: u64) -> u64 { x ^ (x.wrapping_add(1)) } @@ -149,7 +149,7 @@ pub fn _blcmsk_u64(x: u64) -> u64 { #[inline(always)] #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(blcs))] -pub fn _blcs_u32(x: u32) -> u32 { +pub unsafe fn _blcs_u32(x: u32) -> u32 { x | (x.wrapping_add(1)) } @@ -160,7 +160,7 @@ pub fn _blcs_u32(x: u32) -> u32 { #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(blcs))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions -pub fn _blcs_u64(x: u64) -> u64 { +pub unsafe fn _blcs_u64(x: u64) -> u64 { x | x.wrapping_add(1) } @@ -170,7 +170,7 @@ pub fn _blcs_u64(x: u64) -> u64 { #[inline(always)] #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(blsfill))] -pub fn _blsfill_u32(x: u32) -> u32 { +pub unsafe fn _blsfill_u32(x: u32) -> u32 { x | (x.wrapping_sub(1)) } @@ -181,7 +181,7 @@ pub fn _blsfill_u32(x: u32) -> u32 { #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(blsfill))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions -pub fn _blsfill_u64(x: u64) -> u64 { +pub unsafe fn _blsfill_u64(x: u64) -> u64 { x | (x.wrapping_sub(1)) } @@ -191,7 +191,7 @@ pub fn _blsfill_u64(x: u64) -> u64 { #[inline(always)] #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(blsic))] -pub fn _blsic_u32(x: u32) -> u32 { +pub unsafe fn _blsic_u32(x: u32) -> u32 { !x | (x.wrapping_sub(1)) } @@ -202,7 +202,7 @@ pub fn _blsic_u32(x: u32) -> u32 { #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(blsic))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions -pub fn _blsic_u64(x: u64) -> u64 { +pub unsafe fn _blsic_u64(x: u64) -> u64 { !x | (x.wrapping_sub(1)) } @@ -213,7 +213,7 @@ pub fn _blsic_u64(x: u64) -> u64 { #[inline(always)] #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(t1mskc))] -pub fn _t1mskc_u32(x: u32) -> u32 { +pub unsafe fn _t1mskc_u32(x: u32) -> u32 { !x | (x.wrapping_add(1)) } @@ -225,7 +225,7 @@ pub 
fn _t1mskc_u32(x: u32) -> u32 { #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(t1mskc))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions -pub fn _t1mskc_u64(x: u64) -> u64 { +pub unsafe fn _t1mskc_u64(x: u64) -> u64 { !x | (x.wrapping_add(1)) } @@ -236,7 +236,7 @@ pub fn _t1mskc_u64(x: u64) -> u64 { #[inline(always)] #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(tzmsk))] -pub fn _tzmsk_u32(x: u32) -> u32 { +pub unsafe fn _tzmsk_u32(x: u32) -> u32 { !x & (x.wrapping_sub(1)) } @@ -248,7 +248,7 @@ pub fn _tzmsk_u32(x: u32) -> u32 { #[target_feature = "+tbm"] #[cfg_attr(test, assert_instr(tzmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions -pub fn _tzmsk_u64(x: u64) -> u64 { +pub unsafe fn _tzmsk_u64(x: u64) -> u64 { !x & (x.wrapping_sub(1)) } @@ -272,122 +272,174 @@ mod tests { #[simd_test = "tbm"] fn _blcfill_u32() { - assert_eq!(tbm::_blcfill_u32(0b0101_0111u32), 0b0101_0000u32); - assert_eq!(tbm::_blcfill_u32(0b1111_1111u32), 0u32); + assert_eq!( + unsafe { tbm::_blcfill_u32(0b0101_0111u32) }, + 0b0101_0000u32); + assert_eq!( + unsafe { tbm::_blcfill_u32(0b1111_1111u32) }, + 0u32); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] fn _blcfill_u64() { - assert_eq!(tbm::_blcfill_u64(0b0101_0111u64), 0b0101_0000u64); - assert_eq!(tbm::_blcfill_u64(0b1111_1111u64), 0u64); + assert_eq!( + unsafe { tbm::_blcfill_u64(0b0101_0111u64) }, + 0b0101_0000u64); + assert_eq!( + unsafe { tbm::_blcfill_u64(0b1111_1111u64) }, + 0u64); } #[simd_test = "tbm"] fn _blci_u32() { - assert_eq!(tbm::_blci_u32(0b0101_0000u32), - 0b1111_1111_1111_1111_1111_1111_1111_1110u32); - assert_eq!(tbm::_blci_u32(0b1111_1111u32), - 0b1111_1111_1111_1111_1111_1110_1111_1111u32); + assert_eq!( + unsafe { tbm::_blci_u32(0b0101_0000u32) }, + 0b1111_1111_1111_1111_1111_1111_1111_1110u32); + assert_eq!( + unsafe { tbm::_blci_u32(0b1111_1111u32) }, + 0b1111_1111_1111_1111_1111_1110_1111_1111u32); } #[simd_test = "tbm"] #[cfg(not(target_arch 
= "x86"))] fn _blci_u64() { - assert_eq!(tbm::_blci_u64(0b0101_0000u64), - 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64); - assert_eq!(tbm::_blci_u64(0b1111_1111u64), - 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64); + assert_eq!( + unsafe { tbm::_blci_u64(0b0101_0000u64) }, + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64); + assert_eq!( + unsafe { tbm::_blci_u64(0b1111_1111u64) }, + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64); } #[simd_test = "tbm"] fn _blcic_u32() { - assert_eq!(tbm::_blcic_u32(0b0101_0001u32), 0b0000_0010u32); - assert_eq!(tbm::_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32); + assert_eq!( + unsafe { tbm::_blcic_u32(0b0101_0001u32) }, + 0b0000_0010u32); + assert_eq!( + unsafe { tbm::_blcic_u32(0b1111_1111u32) }, + 0b1_0000_0000u32); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] fn _blcic_u64() { - assert_eq!(tbm::_blcic_u64(0b0101_0001u64), 0b0000_0010u64); - assert_eq!(tbm::_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64); + assert_eq!( + unsafe { tbm::_blcic_u64(0b0101_0001u64) }, + 0b0000_0010u64); + assert_eq!( + unsafe { tbm::_blcic_u64(0b1111_1111u64) }, + 0b1_0000_0000u64); } #[simd_test = "tbm"] fn _blcmsk_u32() { - assert_eq!(tbm::_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32); - assert_eq!(tbm::_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32); + assert_eq!( + unsafe { tbm::_blcmsk_u32(0b0101_0001u32) }, + 0b0000_0011u32); + assert_eq!( + unsafe { tbm::_blcmsk_u32(0b1111_1111u32) }, + 0b1_1111_1111u32); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] fn _blcmsk_u64() { - assert_eq!(tbm::_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64); - assert_eq!(tbm::_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64); + assert_eq!( + unsafe { tbm::_blcmsk_u64(0b0101_0001u64) }, + 0b0000_0011u64); + assert_eq!( + unsafe { tbm::_blcmsk_u64(0b1111_1111u64) }, + 0b1_1111_1111u64); } #[simd_test = 
"tbm"] fn _blcs_u32() { - assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32); - assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32); + assert_eq!(unsafe { tbm::_blcs_u32(0b0101_0001u32) }, 0b0101_0011u32); + assert_eq!(unsafe { tbm::_blcs_u32(0b1111_1111u32) }, 0b1_1111_1111u32); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] fn _blcs_u64() { - assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64); - assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64); + assert_eq!(unsafe { tbm::_blcs_u64(0b0101_0001u64) }, 0b0101_0011u64); + assert_eq!(unsafe { tbm::_blcs_u64(0b1111_1111u64) }, 0b1_1111_1111u64); } #[simd_test = "tbm"] fn _blsfill_u32() { - assert_eq!(tbm::_blsfill_u32(0b0101_0100u32), 0b0101_0111u32); - assert_eq!(tbm::_blsfill_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32); + assert_eq!( + unsafe { tbm::_blsfill_u32(0b0101_0100u32) }, + 0b0101_0111u32); + assert_eq!( + unsafe { tbm::_blsfill_u32(0u32) }, + 0b1111_1111_1111_1111_1111_1111_1111_1111u32); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] fn _blsfill_u64() { - assert_eq!(tbm::_blsfill_u64(0b0101_0100u64), 0b0101_0111u64); - assert_eq!(tbm::_blsfill_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64); + assert_eq!( + unsafe { tbm::_blsfill_u64(0b0101_0100u64) }, + 0b0101_0111u64); + assert_eq!( + unsafe { tbm::_blsfill_u64(0u64) }, + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64); } #[simd_test = "tbm"] fn _blsic_u32() { - assert_eq!(tbm::_blsic_u32(0b0101_0100u32), 0b1111_1111_1111_1111_1111_1111_1111_1011u32); - assert_eq!(tbm::_blsic_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32); + assert_eq!( + unsafe { tbm::_blsic_u32(0b0101_0100u32) }, + 0b1111_1111_1111_1111_1111_1111_1111_1011u32); + assert_eq!( + unsafe { tbm::_blsic_u32(0u32) }, + 0b1111_1111_1111_1111_1111_1111_1111_1111u32); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] fn 
_blsic_u64() { - assert_eq!(tbm::_blsic_u64(0b0101_0100u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64); - assert_eq!(tbm::_blsic_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64); + assert_eq!( + unsafe { tbm::_blsic_u64(0b0101_0100u64) }, + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64); + assert_eq!( + unsafe { tbm::_blsic_u64(0u64) }, + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64); } #[simd_test = "tbm"] fn _t1mskc_u32() { - assert_eq!(tbm::_t1mskc_u32(0b0101_0111u32), 0b1111_1111_1111_1111_1111_1111_1111_1000u32); - assert_eq!(tbm::_t1mskc_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32); + assert_eq!( + unsafe { tbm::_t1mskc_u32(0b0101_0111u32) }, + 0b1111_1111_1111_1111_1111_1111_1111_1000u32); + assert_eq!( + unsafe { tbm::_t1mskc_u32(0u32) }, + 0b1111_1111_1111_1111_1111_1111_1111_1111u32); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] fn _t1mksc_u64() { - assert_eq!(tbm::_t1mskc_u64(0b0101_0111u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64); - assert_eq!(tbm::_t1mskc_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64); + assert_eq!( + unsafe { tbm::_t1mskc_u64(0b0101_0111u64) }, + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64); + assert_eq!( + unsafe { tbm::_t1mskc_u64(0u64) }, + 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64); } #[simd_test = "tbm"] fn _tzmsk_u32() { - assert_eq!(tbm::_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32); - assert_eq!(tbm::_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32); + assert_eq!(unsafe { tbm::_tzmsk_u32(0b0101_1000u32) }, 0b0000_0111u32); + assert_eq!(unsafe { tbm::_tzmsk_u32(0b0101_1001u32) }, 0b0000_0000u32); } #[simd_test = "tbm"] #[cfg(not(target_arch = "x86"))] fn _tzmsk_u64() 
{ - assert_eq!(tbm::_tzmsk_u64(0b0101_1000u64), 0b0000_0111u64); - assert_eq!(tbm::_tzmsk_u64(0b0101_1001u64), 0b0000_0000u64); + assert_eq!(unsafe { tbm::_tzmsk_u64(0b0101_1000u64) }, 0b0000_0111u64); + assert_eq!(unsafe { tbm::_tzmsk_u64(0b0101_1001u64) }, 0b0000_0000u64); } }