diff --git a/library/stdarch/coresimd/src/aarch64/neon.rs b/library/stdarch/coresimd/src/aarch64/neon.rs index 353a5987c725..047fe2c4a7c4 100644 --- a/library/stdarch/coresimd/src/aarch64/neon.rs +++ b/library/stdarch/coresimd/src/aarch64/neon.rs @@ -8,7 +8,7 @@ use simd_llvm::simd_add; use v128::f64x2; /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] pub unsafe fn vadd_f64(a: f64, b: f64) -> f64 { @@ -16,7 +16,7 @@ pub unsafe fn vadd_f64(a: f64, b: f64) -> f64 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] pub unsafe fn vaddq_f64(a: f64x2, b: f64x2) -> f64x2 { @@ -24,7 +24,7 @@ pub unsafe fn vaddq_f64(a: f64x2, b: f64x2) -> f64x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 { @@ -32,7 +32,7 @@ pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddd_u64(a: u64, b: u64) -> u64 { diff --git a/library/stdarch/coresimd/src/aarch64/v8.rs b/library/stdarch/coresimd/src/aarch64/v8.rs index 55a352bcd8a3..e9c00338bd36 100644 --- a/library/stdarch/coresimd/src/aarch64/v8.rs +++ b/library/stdarch/coresimd/src/aarch64/v8.rs @@ -9,14 +9,14 @@ use stdsimd_test::assert_instr; /// Reverse the order of the bytes. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rev))] pub unsafe fn _rev_u64(x: u64) -> u64 { x.swap_bytes() as u64 } /// Count Leading Zeros. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(clz))] pub unsafe fn _clz_u64(x: u64) -> u64 { x.leading_zeros() as u64 @@ -29,7 +29,7 @@ extern "C" { } /// Reverse the bit order. 
-#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rbit))] pub unsafe fn _rbit_u64(x: u64) -> u64 { rbit_u64(x as i64) as u64 @@ -39,7 +39,7 @@ pub unsafe fn _rbit_u64(x: u64) -> u64 { /// /// When all bits of the operand are set it returns the size of the operand in /// bits. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(cls))] pub unsafe fn _cls_u32(x: u32) -> u32 { u32::leading_zeros((((((x as i32) >> 31) as u32) ^ x) << 1) | 1) as u32 @@ -49,7 +49,7 @@ pub unsafe fn _cls_u32(x: u32) -> u32 { /// /// When all bits of the operand are set it returns the size of the operand in /// bits. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(cls))] pub unsafe fn _cls_u64(x: u64) -> u64 { u64::leading_zeros((((((x as i64) >> 63) as u64) ^ x) << 1) | 1) as u64 diff --git a/library/stdarch/coresimd/src/arm/neon.rs b/library/stdarch/coresimd/src/arm/neon.rs index 0c4efae29f4a..858594ccd497 100644 --- a/library/stdarch/coresimd/src/arm/neon.rs +++ b/library/stdarch/coresimd/src/arm/neon.rs @@ -9,7 +9,7 @@ use v64::{f32x2, i16x4, i32x2, i8x8, u16x4, u32x2, u8x8}; use v128::{f32x4, i16x8, i32x4, i64x2, i8x16, u16x8, u32x4, u64x2, u8x16}; /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_s8(a: i8x8, b: i8x8) -> i8x8 { @@ -17,7 +17,7 @@ pub unsafe fn vadd_s8(a: i8x8, b: i8x8) -> i8x8 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_s8(a: i8x16, b: i8x16) -> i8x16 { @@ -25,7 +25,7 @@ pub unsafe fn vaddq_s8(a: i8x16, b: i8x16) -> i8x16 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_s16(a: i16x4, b: i16x4) -> i16x4 { @@ -33,7 +33,7 @@ pub unsafe fn vadd_s16(a: i16x4, b: i16x4) -> i16x4 { } /// Vector add. 
-#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_s16(a: i16x8, b: i16x8) -> i16x8 { @@ -41,7 +41,7 @@ pub unsafe fn vaddq_s16(a: i16x8, b: i16x8) -> i16x8 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_s32(a: i32x2, b: i32x2) -> i32x2 { @@ -49,7 +49,7 @@ pub unsafe fn vadd_s32(a: i32x2, b: i32x2) -> i32x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_s32(a: i32x4, b: i32x4) -> i32x4 { @@ -57,7 +57,7 @@ pub unsafe fn vaddq_s32(a: i32x4, b: i32x4) -> i32x4 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_s64(a: i64x2, b: i64x2) -> i64x2 { @@ -65,7 +65,7 @@ pub unsafe fn vaddq_s64(a: i64x2, b: i64x2) -> i64x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_u8(a: u8x8, b: u8x8) -> u8x8 { @@ -73,7 +73,7 @@ pub unsafe fn vadd_u8(a: u8x8, b: u8x8) -> u8x8 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_u8(a: u8x16, b: u8x16) -> u8x16 { @@ -81,7 +81,7 @@ pub unsafe fn vaddq_u8(a: u8x16, b: u8x16) -> u8x16 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_u16(a: u16x4, b: u16x4) -> u16x4 { @@ -89,7 +89,7 @@ pub unsafe fn vadd_u16(a: u16x4, b: u16x4) -> u16x4 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_u16(a: u16x8, b: u16x8) -> u16x8 { @@ -97,7 +97,7 @@ pub unsafe fn vaddq_u16(a: u16x8, b: u16x8) -> u16x8 { } /// Vector add. 
-#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_u32(a: u32x2, b: u32x2) -> u32x2 { @@ -105,7 +105,7 @@ pub unsafe fn vadd_u32(a: u32x2, b: u32x2) -> u32x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_u32(a: u32x4, b: u32x4) -> u32x4 { @@ -113,7 +113,7 @@ pub unsafe fn vaddq_u32(a: u32x4, b: u32x4) -> u32x4 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_u64(a: u64x2, b: u64x2) -> u64x2 { @@ -121,7 +121,7 @@ pub unsafe fn vaddq_u64(a: u64x2, b: u64x2) -> u64x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] pub unsafe fn vadd_f32(a: f32x2, b: f32x2) -> f32x2 { @@ -129,7 +129,7 @@ pub unsafe fn vadd_f32(a: f32x2, b: f32x2) -> f32x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] pub unsafe fn vaddq_f32(a: f32x4, b: f32x4) -> f32x4 { @@ -137,7 +137,7 @@ pub unsafe fn vaddq_f32(a: f32x4, b: f32x4) -> f32x4 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s8(a: i8x8, b: i8x8) -> i16x8 { @@ -147,7 +147,7 @@ pub unsafe fn vaddl_s8(a: i8x8, b: i8x8) -> i16x8 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s16(a: i16x4, b: i16x4) -> i32x4 { @@ -157,7 +157,7 @@ pub unsafe fn vaddl_s16(a: i16x4, b: i16x4) -> i32x4 { } /// Vector long add. 
-#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s32(a: i32x2, b: i32x2) -> i64x2 { @@ -167,7 +167,7 @@ pub unsafe fn vaddl_s32(a: i32x2, b: i32x2) -> i64x2 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u8(a: u8x8, b: u8x8) -> u16x8 { @@ -177,7 +177,7 @@ pub unsafe fn vaddl_u8(a: u8x8, b: u8x8) -> u16x8 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u16(a: u16x4, b: u16x4) -> u32x4 { @@ -187,7 +187,7 @@ pub unsafe fn vaddl_u16(a: u16x4, b: u16x4) -> u32x4 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u32(a: u32x2, b: u32x2) -> u64x2 { @@ -205,7 +205,7 @@ extern "C" { } /// Reciprocal square-root estimate. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(frsqrte))] pub unsafe fn vrsqrte_f32(a: f32x2) -> f32x2 { diff --git a/library/stdarch/coresimd/src/arm/v6.rs b/library/stdarch/coresimd/src/arm/v6.rs index 33fdda67e940..c2011fba7857 100644 --- a/library/stdarch/coresimd/src/arm/v6.rs +++ b/library/stdarch/coresimd/src/arm/v6.rs @@ -10,14 +10,14 @@ use stdsimd_test::assert_instr; /// Reverse the order of the bytes. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rev))] pub unsafe fn _rev_u16(x: u16) -> u16 { x.swap_bytes() as u16 } /// Reverse the order of the bytes. 
-#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rev))] pub unsafe fn _rev_u32(x: u32) -> u32 { x.swap_bytes() as u32 diff --git a/library/stdarch/coresimd/src/arm/v7.rs b/library/stdarch/coresimd/src/arm/v7.rs index b62001311430..f8a735f157b1 100644 --- a/library/stdarch/coresimd/src/arm/v7.rs +++ b/library/stdarch/coresimd/src/arm/v7.rs @@ -13,28 +13,28 @@ pub use super::v6::*; use stdsimd_test::assert_instr; /// Count Leading Zeros. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(clz))] pub unsafe fn _clz_u8(x: u8) -> u8 { x.leading_zeros() as u8 } /// Count Leading Zeros. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(clz))] pub unsafe fn _clz_u16(x: u16) -> u16 { x.leading_zeros() as u16 } /// Count Leading Zeros. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(clz))] pub unsafe fn _clz_u32(x: u32) -> u32 { x.leading_zeros() as u32 } /// Reverse the bit order. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rbit))] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg(dont_compile_me)] // FIXME need to add `v7` upstream in rustc diff --git a/library/stdarch/coresimd/src/nvptx/mod.rs b/library/stdarch/coresimd/src/nvptx/mod.rs index 21248e31938a..8a444b43aefc 100644 --- a/library/stdarch/coresimd/src/nvptx/mod.rs +++ b/library/stdarch/coresimd/src/nvptx/mod.rs @@ -42,79 +42,79 @@ extern "C" { } /// Synchronizes all threads in the block. -#[inline(always)] +#[inline] pub unsafe fn _syncthreads() -> () { syncthreads() } /// x-th thread-block dimension. -#[inline(always)] +#[inline] pub unsafe fn _block_dim_x() -> i32 { block_dim_x() } /// y-th thread-block dimension. -#[inline(always)] +#[inline] pub unsafe fn _block_dim_y() -> i32 { block_dim_y() } /// z-th thread-block dimension. -#[inline(always)] +#[inline] pub unsafe fn _block_dim_z() -> i32 { block_dim_z() } /// x-th thread-block index. 
-#[inline(always)] +#[inline] pub unsafe fn _block_idx_x() -> i32 { block_idx_x() } /// y-th thread-block index. -#[inline(always)] +#[inline] pub unsafe fn _block_idx_y() -> i32 { block_idx_y() } /// z-th thread-block index. -#[inline(always)] +#[inline] pub unsafe fn _block_idx_z() -> i32 { block_idx_z() } /// x-th block-grid dimension. -#[inline(always)] +#[inline] pub unsafe fn _grid_dim_x() -> i32 { grid_dim_x() } /// y-th block-grid dimension. -#[inline(always)] +#[inline] pub unsafe fn _grid_dim_y() -> i32 { grid_dim_y() } /// z-th block-grid dimension. -#[inline(always)] +#[inline] pub unsafe fn _grid_dim_z() -> i32 { grid_dim_z() } /// x-th thread index. -#[inline(always)] +#[inline] pub unsafe fn _thread_idx_x() -> i32 { thread_idx_x() } /// y-th thread index. -#[inline(always)] +#[inline] pub unsafe fn _thread_idx_y() -> i32 { thread_idx_y() } /// z-th thread index. -#[inline(always)] +#[inline] pub unsafe fn _thread_idx_z() -> i32 { thread_idx_z() } diff --git a/library/stdarch/coresimd/src/x86/i386/fxsr.rs b/library/stdarch/coresimd/src/x86/i386/fxsr.rs index 28c8fb5c2a8d..b67057880a2d 100644 --- a/library/stdarch/coresimd/src/x86/i386/fxsr.rs +++ b/library/stdarch/coresimd/src/x86/i386/fxsr.rs @@ -21,7 +21,7 @@ extern "C" { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html -#[inline(always)] +#[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxsave))] pub unsafe fn _fxsave(mem_addr: *mut u8) { @@ -42,7 +42,7 @@ pub unsafe fn _fxsave(mem_addr: *mut u8) { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html -#[inline(always)] +#[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxrstor))] pub unsafe fn _fxrstor(mem_addr: *const u8) { diff --git a/library/stdarch/coresimd/src/x86/i586/abm.rs b/library/stdarch/coresimd/src/x86/i586/abm.rs index 
8ee4659d2d01..5480b964ab99 100644 --- a/library/stdarch/coresimd/src/x86/i586/abm.rs +++ b/library/stdarch/coresimd/src/x86/i586/abm.rs @@ -23,7 +23,7 @@ use stdsimd_test::assert_instr; /// Counts the leading most significant zero bits. /// /// When the operand is zero, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "lzcnt")] #[cfg_attr(test, assert_instr(lzcnt))] pub unsafe fn _lzcnt_u32(x: u32) -> u32 { @@ -31,7 +31,7 @@ pub unsafe fn _lzcnt_u32(x: u32) -> u32 { } /// Counts the bits that are set. -#[inline(always)] +#[inline] #[target_feature(enable = "popcnt")] #[cfg_attr(test, assert_instr(popcnt))] pub unsafe fn _popcnt32(x: i32) -> i32 { diff --git a/library/stdarch/coresimd/src/x86/i586/avx.rs b/library/stdarch/coresimd/src/x86/i586/avx.rs index c21b9a0caf7d..cba133c734cc 100644 --- a/library/stdarch/coresimd/src/x86/i586/avx.rs +++ b/library/stdarch/coresimd/src/x86/i586/avx.rs @@ -26,7 +26,7 @@ use x86::*; /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddpd))] pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d { @@ -35,7 +35,7 @@ pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d { /// Add packed single-precision (32-bit) floating-point elements in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddps))] pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { @@ -45,7 +45,7 @@ pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { /// Compute the bitwise AND of a packed double-precision (64-bit) /// floating-point elements /// in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // FIXME: Should be 'vandpd' instuction. 
// See https://github.com/rust-lang-nursery/stdsimd/issues/71 @@ -58,7 +58,7 @@ pub unsafe fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise AND of packed single-precision (32-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vandps))] pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 { @@ -69,7 +69,7 @@ pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 { /// Compute the bitwise OR packed double-precision (64-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // FIXME: Should be 'vorpd' instuction. // See https://github.com/rust-lang-nursery/stdsimd/issues/71 @@ -82,7 +82,7 @@ pub unsafe fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise OR packed single-precision (32-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vorps))] pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 { @@ -93,7 +93,7 @@ pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 { /// Shuffle double-precision (64-bit) floating-point elements within 128-bit /// lanes using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x1))] pub unsafe fn _mm256_shuffle_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { @@ -135,7 +135,7 @@ pub unsafe fn _mm256_shuffle_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { /// Shuffle single-precision (32-bit) floating-point elements in `a` within /// 128-bit lanes using the control in `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vshufps, imm8 = 0x0))] pub unsafe fn _mm256_shuffle_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -186,7 +186,7 @@ pub unsafe fn _mm256_shuffle_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// Compute the bitwise NOT of packed double-precision (64-bit) floating-point /// elements in `a` /// and then AND with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // FIXME: Should be 'vandnpd' instruction. #[cfg_attr(test, assert_instr(vandnps))] @@ -199,7 +199,7 @@ pub unsafe fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point /// elements in `a` /// and then AND with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vandnps))] pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { @@ -210,7 +210,7 @@ pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// Compare packed double-precision (64-bit) floating-point elements /// in `a` and `b`, and return packed maximum values -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaxpd))] pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d { @@ -219,7 +219,7 @@ pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d { /// Compare packed single-precision (32-bit) floating-point elements in `a` /// and `b`, and return packed maximum values -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaxps))] pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 { @@ -228,7 +228,7 @@ pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 { /// Compare packed double-precision (64-bit) floating-point elements /// in `a` and `b`, and return packed minimum values -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, 
assert_instr(vminpd))] pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d { @@ -237,7 +237,7 @@ pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d { /// Compare packed single-precision (32-bit) floating-point elements in `a` /// and `b`, and return packed minimum values -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vminps))] pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 { @@ -246,7 +246,7 @@ pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 { /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmulpd))] pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d { @@ -255,7 +255,7 @@ pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d { /// Add packed single-precision (32-bit) floating-point elements in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmulps))] pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { @@ -264,7 +264,7 @@ pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { /// Alternatively add and subtract packed double-precision (64-bit) /// floating-point elements in `a` to/from packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddsubpd))] pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { @@ -273,7 +273,7 @@ pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { /// Alternatively add and subtract packed single-precision (32-bit) /// floating-point elements in `a` to/from packed elements in `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddsubps))] pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { @@ -282,7 +282,7 @@ pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { /// Subtract packed double-precision (64-bit) floating-point elements in `b` /// from packed elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsubpd))] pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { @@ -291,7 +291,7 @@ pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { /// Subtract packed single-precision (32-bit) floating-point elements in `b` /// from packed elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsubps))] pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { @@ -300,7 +300,7 @@ pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { /// Compute the division of each of the 8 packed 32-bit floating-point elements /// in `a` by the corresponding packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vdivps))] pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { @@ -309,7 +309,7 @@ pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { /// Compute the division of each of the 4 packed 64-bit floating-point elements /// in `a` by the corresponding packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vdivpd))] pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { @@ -327,7 +327,7 @@ pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { /// For a complete list of options, check [the LLVM docs][llvm_docs]. 
/// /// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundpd, b = 0x3))] pub unsafe fn _mm256_round_pd(a: __m256d, b: i32) -> __m256d { @@ -339,7 +339,7 @@ pub unsafe fn _mm256_round_pd(a: __m256d, b: i32) -> __m256d { /// Round packed double-precision (64-bit) floating point elements in `a` /// toward positive infinity. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundpd))] pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d { @@ -348,7 +348,7 @@ pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d { /// Round packed double-precision (64-bit) floating point elements in `a` /// toward negative infinity. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundpd))] pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d { @@ -366,7 +366,7 @@ pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d { /// For a complete list of options, check [the LLVM docs][llvm_docs]. /// /// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundps, b = 0x00))] pub unsafe fn _mm256_round_ps(a: __m256, b: i32) -> __m256 { @@ -380,7 +380,7 @@ pub unsafe fn _mm256_round_ps(a: __m256, b: i32) -> __m256 { /// Round packed single-precision (32-bit) floating point elements in `a` /// toward positive infinity. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundps))] pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 { @@ -389,7 +389,7 @@ pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 { /// Round packed single-precision (32-bit) floating point elements in `a` /// toward negative infinity. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundps))] pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 { @@ -398,7 +398,7 @@ pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 { /// Return the square root of packed single-precision (32-bit) floating point /// elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 { @@ -407,7 +407,7 @@ pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 { /// Return the square root of packed double-precision (64-bit) floating point /// elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d { @@ -416,7 +416,7 @@ pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d { /// Blend packed double-precision (64-bit) floating-point elements from /// `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))] pub unsafe fn _mm256_blend_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { @@ -458,7 +458,7 @@ pub unsafe fn _mm256_blend_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { /// Blend packed single-precision (32-bit) floating-point elements from /// `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))] pub unsafe fn _mm256_blend_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -508,7 +508,7 @@ pub unsafe fn _mm256_blend_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// Blend packed double-precision (64-bit) floating-point elements from /// `a` and `b` using `c` as a mask. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendvpd))] pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { @@ -517,7 +517,7 @@ pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { /// Blend packed single-precision (32-bit) floating-point elements from /// `a` and `b` using `c` as a mask. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendvps))] pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { @@ -528,7 +528,7 @@ pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { /// elements in `a` and `b` using the high 4 bits in `imm8`, /// sum the four products, and conditionally return the sum /// using the low 4 bits of `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vdpps, imm8 = 0x0))] pub unsafe fn _mm256_dp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -542,7 +542,7 @@ pub unsafe fn _mm256_dp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// of 4 64-bit floating points `a` and `b`. /// In the result, sums of elements from `a` are returned in even locations, /// while sums of elements from `b` are returned in odd locations. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhaddpd))] pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { @@ -554,7 +554,7 @@ pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { /// In the result, sums of elements from `a` are returned in locations of /// indices 0, 1, 4, 5; while sums of elements from `b` are locations /// 2, 3, 6, 7. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhaddps))] pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { @@ -565,7 +565,7 @@ pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { /// of 4 64-bit floating points `a` and `b`. /// In the result, sums of elements from `a` are returned in even locations, /// while sums of elements from `b` are returned in odd locations. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhsubpd))] pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { @@ -577,7 +577,7 @@ pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { /// In the result, sums of elements from `a` are returned in locations of /// indices 0, 1, 4, 5; while sums of elements from `b` are locations /// 2, 3, 6, 7. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhsubps))] pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { @@ -586,7 +586,7 @@ pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // FIXME Should be 'vxorpd' instruction. #[cfg_attr(test, assert_instr(vxorps))] @@ -598,7 +598,7 @@ pub unsafe fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 { @@ -675,7 +675,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// Compare packed double-precision (64-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd pub unsafe fn _mm_cmp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { @@ -688,7 +688,7 @@ pub unsafe fn _mm_cmp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// Compare packed double-precision (64-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { @@ -701,7 +701,7 @@ pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { /// Compare packed single-precision (32-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps pub unsafe fn _mm_cmp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { @@ -714,7 +714,7 @@ pub unsafe fn _mm_cmp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Compare packed single-precision (32-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -729,7 +729,7 @@ pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// store the result in the lower element of returned vector, /// and copy the upper element from `a` to the upper element of returned /// vector. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] #[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { @@ -744,7 +744,7 @@ pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// store the result in the lower element of returned vector, /// and copy the upper 3 packed elements from `a` to the upper elements of /// returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] #[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss pub unsafe fn _mm_cmp_ss(a: __m128, b: __m128, imm8: i32) -> __m128 { @@ -756,7 +756,7 @@ pub unsafe fn _mm_cmp_ss(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtdq2pd))] pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d { @@ -765,7 +765,7 @@ pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d { /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtdq2ps))] pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 { @@ -774,7 +774,7 @@ pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 { /// Convert packed double-precision (64-bit) floating-point elements in `a` /// to packed single-precision (32-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtpd2ps))] pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { @@ -783,7 +783,7 @@ pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtps2dq))] pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i { @@ -792,7 +792,7 @@ pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i { /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed double-precision (64-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtps2pd))] pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d { @@ -801,7 +801,7 @@ pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d { /// Convert packed double-precision (64-bit) floating-point elements in `a` /// to packed 32-bit integers with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvttpd2dq))] pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i { @@ -810,7 +810,7 @@ pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i { /// Convert packed double-precision (64-bit) floating-point elements in `a` /// to packed 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtpd2dq))] pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i { @@ -819,7 +819,7 @@ pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i { /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvttps2dq))] pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i { @@ -828,7 +828,7 @@ pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i { /// Extract 128 bits (composed of 4 packed single-precision (32-bit) /// floating-point elements) from `a`, selected with `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vextractf128))] pub unsafe fn _mm256_extractf128_ps(a: __m256, imm8: i32) -> __m128 { @@ -840,7 +840,7 @@ pub unsafe fn _mm256_extractf128_ps(a: __m256, imm8: i32) -> __m128 { /// Extract 128 bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from `a`, selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vextractf128))] pub unsafe fn _mm256_extractf128_pd(a: __m256d, imm8: i32) -> __m128d { @@ -851,7 +851,7 @@ pub unsafe fn _mm256_extractf128_pd(a: __m256d, imm8: i32) -> __m128d { } /// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vextractf128))] pub unsafe fn _mm256_extractf128_si256(a: __m256i, imm8: i32) -> __m128i { @@ -864,7 +864,7 @@ pub unsafe fn _mm256_extractf128_si256(a: __m256i, imm8: i32) -> __m128i { } /// Zero the contents of all XMM or YMM registers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vzeroall))] pub unsafe fn _mm256_zeroall() { @@ -873,7 +873,7 @@ pub unsafe fn _mm256_zeroall() { /// Zero the upper 128 bits of all YMM registers; /// the lower 128-bits of the registers are unmodified. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vzeroupper))] pub unsafe fn _mm256_zeroupper() { @@ -882,7 +882,7 @@ pub unsafe fn _mm256_zeroupper() { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilps))] pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 { @@ -891,7 +891,7 @@ pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// using the control in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilps))] pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { @@ -900,7 +900,7 @@ pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] pub unsafe fn _mm256_permute_ps(a: __m256, imm8: i32) -> __m256 { @@ -952,7 +952,7 @@ pub unsafe fn _mm256_permute_ps(a: __m256, imm8: i32) -> __m256 { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] pub unsafe fn _mm_permute_ps(a: __m128, imm8: i32) -> __m128 { @@ -1005,7 +1005,7 @@ pub unsafe fn _mm_permute_ps(a: __m128, imm8: i32) -> __m128 { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// within 256-bit lanes using the control in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilpd))] pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { @@ -1014,7 +1014,7 @@ pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// using the control in `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilpd))] pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { @@ -1023,7 +1023,7 @@ pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] pub unsafe fn _mm256_permute_pd(a: __m256d, imm8: i32) -> __m256d { @@ -1065,7 +1065,7 @@ pub unsafe fn _mm256_permute_pd(a: __m256d, imm8: i32) -> __m256d { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d { @@ -1091,7 +1091,7 @@ pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d { /// Shuffle 256-bits (composed of 8 packed single-precision (32-bit) /// floating-point elements) selected by `imm8` from `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x5))] pub unsafe fn _mm256_permute2f128_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -1103,7 +1103,7 @@ pub unsafe fn _mm256_permute2f128_ps(a: __m256, b: __m256, imm8: i32) -> __m256 /// Shuffle 256-bits (composed of 4 packed double-precision (64-bit) /// floating-point elements) selected by `imm8` from `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))] pub unsafe fn _mm256_permute2f128_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { @@ -1115,7 +1115,7 @@ pub unsafe fn _mm256_permute2f128_pd(a: __m256d, b: __m256d, imm8: i32) -> __m25 /// Shuffle 258-bits (composed of integer data) selected by `imm8` /// from `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))] pub unsafe fn _mm256_permute2f128_si256( @@ -1132,7 +1132,7 @@ pub unsafe fn _mm256_permute2f128_si256( /// Broadcast a single-precision (32-bit) floating-point element from memory /// to all elements of the returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 { @@ -1141,7 +1141,7 @@ pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 { /// Broadcast a single-precision (32-bit) floating-point element from memory /// to all elements of the returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 { @@ -1150,7 +1150,7 @@ pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 { /// Broadcast a double-precision (64-bit) floating-point element from memory /// to all elements of the returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastsd))] pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d { @@ -1159,7 +1159,7 @@ pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d { /// Broadcast 128 bits from memory (composed of 4 packed single-precision /// (32-bit) floating-point elements) to all elements of the returned vector. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastf128))] pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 { @@ -1168,7 +1168,7 @@ pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 { /// Broadcast 128 bits from memory (composed of 2 packed double-precision /// (64-bit) floating-point elements) to all elements of the returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastf128))] pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { @@ -1178,7 +1178,7 @@ pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { /// Copy `a` to result, then insert 128 bits (composed of 4 packed /// single-precision (32-bit) floating-point elements) from `b` into result /// at the location specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 { @@ -1192,7 +1192,7 @@ pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 { /// Copy `a` to result, then insert 128 bits (composed of 2 packed /// double-precision (64-bit) floating-point elements) from `b` into result /// at the location specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_insertf128_pd(a: __m256d, b: __m128d, imm8: i32) -> __m256d { @@ -1204,7 +1204,7 @@ pub unsafe fn _mm256_insertf128_pd(a: __m256d, b: __m128d, imm8: i32) -> __m256d /// Copy `a` to result, then insert 128 bits from `b` into result /// at the location specified by `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_insertf128_si256( @@ -1220,7 +1220,7 @@ pub unsafe fn _mm256_insertf128_si256( /// Copy `a` to result, and insert the 8-bit integer `i` into result /// at the location specified by `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i { @@ -1229,7 +1229,7 @@ pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i { /// Copy `a` to result, and insert the 16-bit integer `i` into result /// at the location specified by `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16, index: i32) -> __m256i { @@ -1238,7 +1238,7 @@ pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16, index: i32) -> __m256i { /// Copy `a` to result, and insert the 32-bit integer `i` into result /// at the location specified by `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_insert_epi32(a: __m256i, i: i32, index: i32) -> __m256i { @@ -1249,7 +1249,7 @@ pub unsafe fn _mm256_insert_epi32(a: __m256i, i: i32, index: i32) -> __m256i { /// floating-point elements) from memory into result. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d { @@ -1260,7 +1260,7 @@ pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d { /// floating-point elements) from `a` into memory. 
/// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected pub unsafe fn _mm256_store_pd(mem_addr: *const f64, a: __m256d) { @@ -1271,7 +1271,7 @@ pub unsafe fn _mm256_store_pd(mem_addr: *const f64, a: __m256d) { /// floating-point elements) from memory into result. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 { @@ -1282,7 +1282,7 @@ pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 { /// floating-point elements) from `a` into memory. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] pub unsafe fn _mm256_store_ps(mem_addr: *const f32, a: __m256) { @@ -1292,7 +1292,7 @@ pub unsafe fn _mm256_store_ps(mem_addr: *const f32, a: __m256) { /// Load 256-bits (composed of 4 packed double-precision (64-bit) /// floating-point elements) from memory into result. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d { @@ -1308,7 +1308,7 @@ pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d { /// Store 256-bits (composed of 4 packed double-precision (64-bit) /// floating-point elements) from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) { @@ -1318,7 +1318,7 @@ pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) { /// Load 256-bits (composed of 8 packed single-precision (32-bit) /// floating-point elements) from memory into result. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 { @@ -1334,7 +1334,7 @@ pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 { /// Store 256-bits (composed of 8 packed single-precision (32-bit) /// floating-point elements) from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) { @@ -1344,7 +1344,7 @@ pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) { /// Load 256-bits of integer data from memory into result. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i { @@ -1354,7 +1354,7 @@ pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i { /// Store 256-bits of integer data from `a` into memory. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) { @@ -1363,7 +1363,7 @@ pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) { /// Load 256-bits of integer data from memory into result. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i { @@ -1378,7 +1378,7 @@ pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i { /// Store 256-bits of integer data from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { @@ -1388,7 +1388,7 @@ pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { /// Load packed double-precision (64-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d { @@ -1397,7 +1397,7 @@ pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d /// Store packed double-precision (64-bit) floating-point elements from `a` /// into memory using `mask`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) { @@ -1407,7 +1407,7 @@ pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) /// Load packed double-precision (64-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d { @@ -1416,7 +1416,7 @@ pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d { /// Store packed double-precision (64-bit) floating-point elements from `a` /// into memory using `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) { @@ -1426,7 +1426,7 @@ pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) { /// Load packed single-precision (32-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 { @@ -1435,7 +1435,7 @@ pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 /// Store packed single-precision (32-bit) floating-point elements from `a` /// into memory using `mask`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) { @@ -1445,7 +1445,7 @@ pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) /// Load packed single-precision (32-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 { @@ -1454,7 +1454,7 @@ pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 { /// Store packed single-precision (32-bit) floating-point elements from `a` /// into memory using `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) { @@ -1463,7 +1463,7 @@ pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) { /// Duplicate odd-indexed single-precision (32-bit) floating-point elements /// from `a`, and return the results. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovshdup))] pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 { @@ -1472,7 +1472,7 @@ pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 { /// Duplicate even-indexed single-precision (32-bit) floating-point elements /// from `a`, and return the results. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovsldup))] pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 { @@ -1481,7 +1481,7 @@ pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 { /// Duplicate even-indexed double-precision (64-bit) floating-point elements /// from "a", and return the results. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovddup))] pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d { @@ -1491,7 +1491,7 @@ pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d { /// Load 256-bits of integer data from unaligned memory into result. /// This intrinsic may perform better than `_mm256_loadu_si256` when the /// data crosses a cache line boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vlddqu))] pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { @@ -1501,7 +1501,7 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { /// Moves integer data from a 256-bit integer vector to a 32-byte /// aligned memory location. To minimize caching, the data is flagged as /// non-temporal (unlikely to be used again soon) -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq pub unsafe fn _mm256_stream_si256(mem_addr: *const __m256i, a: __m256i) { @@ -1511,7 +1511,7 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *const __m256i, a: __m256i) { /// Moves double-precision values from a 256-bit vector of [4 x double] /// to a 32-byte aligned memory location. To minimize caching, the data is /// flagged as non-temporal (unlikely to be used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntpd pub unsafe fn _mm256_stream_pd(mem_addr: *const f64, a: __m256d) { @@ -1522,7 +1522,7 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *const f64, a: __m256d) { /// of [8 x float] to a 32-byte aligned memory location. To minimize /// caching, the data is flagged as non-temporal (unlikely to be used again /// soon). 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovntps))] pub unsafe fn _mm256_stream_ps(mem_addr: *const f32, a: __m256) { @@ -1532,7 +1532,7 @@ pub unsafe fn _mm256_stream_ps(mem_addr: *const f32, a: __m256) { /// Compute the approximate reciprocal of packed single-precision (32-bit) /// floating-point elements in `a`, and return the results. The maximum /// relative error for this approximation is less than 1.5*2^-12. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vrcpps))] pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 { @@ -1542,7 +1542,7 @@ pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 { /// Compute the approximate reciprocal square root of packed single-precision /// (32-bit) floating-point elements in `a`, and return the results. /// The maximum relative error for this approximation is less than 1.5*2^-12. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vrsqrtps))] pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 { @@ -1551,7 +1551,7 @@ pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 { /// Unpack and interleave double-precision (64-bit) floating-point elements /// from the high half of each 128-bit lane in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpckhpd))] pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d { @@ -1560,7 +1560,7 @@ pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d { /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the high half of each 128-bit lane in `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpckhps))] pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 { @@ -1569,7 +1569,7 @@ pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 { /// Unpack and interleave double-precision (64-bit) floating-point elements /// from the low half of each 128-bit lane in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpcklpd))] pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d { @@ -1578,7 +1578,7 @@ pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d { /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the low half of each 128-bit lane in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpcklps))] pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { @@ -1589,7 +1589,7 @@ pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. /// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if /// the result is zero, otherwise set `CF` to 0. Return the `ZF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vptest))] pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { @@ -1600,7 +1600,7 @@ pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. /// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if /// the result is zero, otherwise set `CF` to 0. Return the `CF` value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vptest))] pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { @@ -1612,7 +1612,7 @@ pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if /// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and /// `CF` values are zero, otherwise return 0. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vptest))] pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { @@ -1626,7 +1626,7 @@ pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { @@ -1640,7 +1640,7 @@ pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { @@ -1655,7 +1655,7 @@ pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { @@ -1669,7 +1669,7 @@ pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { @@ -1683,7 +1683,7 @@ pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { @@ -1698,7 +1698,7 @@ pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { @@ -1712,7 +1712,7 @@ pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { @@ -1726,7 +1726,7 @@ pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { @@ -1741,7 +1741,7 @@ pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { @@ -1755,7 +1755,7 @@ pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { @@ -1769,7 +1769,7 @@ pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { @@ -1784,7 +1784,7 @@ pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { @@ -1794,7 +1794,7 @@ pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { /// Set each bit of the returned mask based on the most significant bit of the /// corresponding packed double-precision (64-bit) floating-point element in /// `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovmskpd))] pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { @@ -1804,7 +1804,7 @@ pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { /// Set each bit of the returned mask based on the most significant bit of the /// corresponding packed single-precision (32-bit) floating-point element in /// `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovmskps))] pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 { @@ -1812,7 +1812,7 @@ pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 { } /// Return vector of type __m256d with all elements set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxorps))] // FIXME vxorpd expected pub unsafe fn _mm256_setzero_pd() -> __m256d { @@ -1820,7 +1820,7 @@ pub unsafe fn _mm256_setzero_pd() -> __m256d { } /// Return vector of type __m256 with all elements set to zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm256_setzero_ps() -> __m256 { @@ -1828,7 +1828,7 @@ pub unsafe fn _mm256_setzero_ps() -> __m256 { } /// Return vector of type __m256i with all elements set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxor))] pub unsafe fn _mm256_setzero_si256() -> __m256i { @@ -1837,7 +1837,7 @@ pub unsafe fn _mm256_setzero_si256() -> __m256i { /// Set packed double-precision (64-bit) floating-point elements in returned /// vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[cfg_attr(test, assert_instr(vinsertf128))] @@ -1847,7 +1847,7 @@ pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { /// Set packed single-precision (32-bit) floating-point elements in returned /// vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set_ps( @@ -1858,7 +1858,7 @@ pub unsafe fn _mm256_set_ps( /// Set packed 8-bit integers in returned vector with the supplied values in /// reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set_epi8( @@ -1877,7 +1877,7 @@ pub unsafe fn _mm256_set_epi8( } /// Set packed 16-bit integers in returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set_epi16( @@ -1895,7 +1895,7 @@ pub unsafe fn _mm256_set_epi16( } /// Set packed 32-bit integers in returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. 
pub unsafe fn _mm256_set_epi32( @@ -1905,7 +1905,7 @@ pub unsafe fn _mm256_set_epi32( } /// Set packed 64-bit integers in returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[cfg_attr(test, assert_instr(vinsertf128))] @@ -1915,7 +1915,7 @@ pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { /// Set packed double-precision (64-bit) floating-point elements in returned /// vector with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { @@ -1924,7 +1924,7 @@ pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { /// Set packed single-precision (32-bit) floating-point elements in returned /// vector with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_setr_ps( @@ -1935,7 +1935,7 @@ pub unsafe fn _mm256_setr_ps( /// Set packed 8-bit integers in returned vector with the supplied values in /// reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_setr_epi8( @@ -1955,7 +1955,7 @@ pub unsafe fn _mm256_setr_epi8( /// Set packed 16-bit integers in returned vector with the supplied values in /// reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_setr_epi16( @@ -1974,7 +1974,7 @@ pub unsafe fn _mm256_setr_epi16( /// Set packed 32-bit integers in returned vector with the supplied values in /// reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. 
pub unsafe fn _mm256_setr_epi32( @@ -1985,7 +1985,7 @@ pub unsafe fn _mm256_setr_epi32( /// Set packed 64-bit integers in returned vector with the supplied values in /// reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[cfg_attr(test, assert_instr(vinsertf128))] @@ -1995,7 +1995,7 @@ pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { /// Broadcast double-precision (64-bit) floating-point value `a` to all /// elements of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d { @@ -2004,7 +2004,7 @@ pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d { /// Broadcast single-precision (32-bit) floating-point value `a` to all /// elements of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 { @@ -2013,7 +2013,7 @@ pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 { /// Broadcast 8-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastb`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpshufb))] #[cfg_attr(test, assert_instr(vinsertf128))] @@ -2030,7 +2030,7 @@ pub unsafe fn _mm256_set1_epi8(a: i8) -> __m256i { /// Broadcast 16-bit integer `a` to all all elements of returned vector. /// This intrinsic may generate the `vpbroadcastw`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] //#[cfg_attr(test, assert_instr(vpshufb))] #[cfg_attr(test, assert_instr(vinsertf128))] @@ -2041,7 +2041,7 @@ pub unsafe fn _mm256_set1_epi16(a: i16) -> __m256i { /// Broadcast 32-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastd`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i { @@ -2050,7 +2050,7 @@ pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i { /// Broadcast 64-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastq`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] //#[cfg_attr(test, assert_instr(vmovddup))] #[cfg_attr(test, assert_instr(vinsertf128))] @@ -2060,7 +2060,7 @@ pub unsafe fn _mm256_set1_epi64x(a: i64) -> __m256i { } /// Cast vector of type __m256d to type __m256. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2069,7 +2069,7 @@ pub unsafe fn _mm256_castpd_ps(a: __m256d) -> __m256 { } /// Cast vector of type __m256 to type __m256d. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2078,7 +2078,7 @@ pub unsafe fn _mm256_castps_pd(a: __m256) -> __m256d { } /// Casts vector of type __m256 to type __m256i. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2087,7 +2087,7 @@ pub unsafe fn _mm256_castps_si256(a: __m256) -> __m256i { } /// Casts vector of type __m256i to type __m256. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2096,7 +2096,7 @@ pub unsafe fn _mm256_castsi256_ps(a: __m256i) -> __m256 { } /// Casts vector of type __m256d to type __m256i. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2105,7 +2105,7 @@ pub unsafe fn _mm256_castpd_si256(a: __m256d) -> __m256i { } /// Casts vector of type __m256i to type __m256d. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2114,7 +2114,7 @@ pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d { } /// Casts vector of type __m256 to type __m128. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2123,7 +2123,7 @@ pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 { } /// Casts vector of type __m256d to type __m128d. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2132,7 +2132,7 @@ pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d { } /// Casts vector of type __m256i to type __m128i. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2144,7 +2144,7 @@ pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i { /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2155,7 +2155,7 @@ pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 { /// Casts vector of type __m128d to type __m256d; /// the upper 128 bits of the result are undefined. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2166,7 +2166,7 @@ pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d { /// Casts vector of type __m128i to type __m256i; /// the upper 128 bits of the result are undefined. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2180,7 +2180,7 @@ pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i { /// Constructs a 256-bit floating-point vector of [8 x float] from a /// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain /// the value of the source vector. The upper 128 bits are set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2191,7 +2191,7 @@ pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 { /// Constructs a 256-bit integer vector from a 128-bit integer vector. /// The lower 128 bits contain the value of the source vector. The upper /// 128 bits are set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2205,7 +2205,7 @@ pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i { /// 128-bit floating-point vector of [2 x double]. The lower 128 bits /// contain the value of the source vector. The upper 128 bits are set /// to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. 
@@ -2214,7 +2214,7 @@ pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d { } /// Return vector of type `__m256` with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_undefined_ps() -> __m256 { @@ -2222,7 +2222,7 @@ pub unsafe fn _mm256_undefined_ps() -> __m256 { } /// Return vector of type `__m256d` with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_undefined_pd() -> __m256d { @@ -2230,7 +2230,7 @@ pub unsafe fn _mm256_undefined_pd() -> __m256d { } /// Return vector of type __m256i with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_undefined_si256() -> __m256i { @@ -2238,7 +2238,7 @@ pub unsafe fn _mm256_undefined_si256() -> __m256i { } /// Set packed __m256 returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 { @@ -2246,7 +2246,7 @@ pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 { } /// Set packed __m256d returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d { @@ -2256,7 +2256,7 @@ pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d { } /// Set packed __m256i returned vector with the supplied values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { @@ -2266,7 +2266,7 @@ pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { } /// Set packed __m256 returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 { @@ -2274,7 +2274,7 @@ pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 { } /// Set packed __m256d returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d { @@ -2282,7 +2282,7 @@ pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d { } /// Set packed __m256i returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { @@ -2293,7 +2293,7 @@ pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { /// floating-point elements) from memory, and combine them into a 256-bit /// value. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_loadu2_m128( @@ -2307,7 +2307,7 @@ pub unsafe fn _mm256_loadu2_m128( /// floating-point elements) from memory, and combine them into a 256-bit /// value. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. 
pub unsafe fn _mm256_loadu2_m128d( @@ -2320,7 +2320,7 @@ pub unsafe fn _mm256_loadu2_m128d( /// Load two 128-bit values (composed of integer data) from memory, and combine /// them into a 256-bit value. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_loadu2_m128i( @@ -2335,7 +2335,7 @@ pub unsafe fn _mm256_loadu2_m128i( /// single-precision (32-bit) floating-point elements) from `a` into memory two /// different 128-bit locations. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_storeu2_m128( @@ -2351,7 +2351,7 @@ pub unsafe fn _mm256_storeu2_m128( /// double-precision (64-bit) floating-point elements) from `a` into memory two /// different 128-bit locations. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_storeu2_m128d( @@ -2366,7 +2366,7 @@ pub unsafe fn _mm256_storeu2_m128d( /// Store the high and low 128-bit halves (each composed of integer data) from /// `a` into memory two different 128-bit locations. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_storeu2_m128i( @@ -2380,7 +2380,7 @@ pub unsafe fn _mm256_storeu2_m128i( } /// Returns the first element of the input vector of [8 x float]. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] //#[cfg_attr(test, assert_instr(movss))] FIXME pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 { diff --git a/library/stdarch/coresimd/src/x86/i586/avx2.rs b/library/stdarch/coresimd/src/x86/i586/avx2.rs index 72892913bbfa..540031009eae 100644 --- a/library/stdarch/coresimd/src/x86/i586/avx2.rs +++ b/library/stdarch/coresimd/src/x86/i586/avx2.rs @@ -31,7 +31,7 @@ use x86::*; use stdsimd_test::assert_instr; /// Computes the absolute values of packed 32-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsd))] pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i { @@ -39,7 +39,7 @@ pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i { } /// Computes the absolute values of packed 16-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsw))] pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i { @@ -47,7 +47,7 @@ pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i { } /// Computes the absolute values of packed 8-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsb))] pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i { @@ -55,7 +55,7 @@ pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i { } /// Add packed 64-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddq))] pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -63,7 +63,7 @@ pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 32-bit integers in `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddd))] pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -71,7 +71,7 @@ pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddw))] pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -79,7 +79,7 @@ pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 8-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddb))] pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -87,7 +87,7 @@ pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddsb))] pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -95,7 +95,7 @@ pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddsw))] pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -103,7 +103,7 @@ pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddusb))] pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -111,7 +111,7 @@ pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddusw))] pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -120,7 +120,7 @@ pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { /// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary /// result, shift the result right by `n` bytes, and return the low 16 bytes. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpalignr, n = 15))] pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i { @@ -187,7 +187,7 @@ pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i { /// Compute the bitwise AND of 256 bits (representing integer data) /// in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vandps))] pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { @@ -196,7 +196,7 @@ pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { /// Compute the bitwise NOT of 256 bits (representing integer data) /// in `a` and then AND with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vandnps))] pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { @@ -205,7 +205,7 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { } /// Average packed unsigned 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpavgw))] pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -213,7 +213,7 @@ pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { } /// Average packed unsigned 8-bit integers in `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpavgb))] pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -221,7 +221,7 @@ pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { } /// Blend packed 32-bit integers from `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendd, imm8 = 9))] pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { @@ -253,7 +253,7 @@ pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { } /// Blend packed 32-bit integers from `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendd, imm8 = 9))] pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { @@ -305,7 +305,7 @@ pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { } /// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendw, imm8 = 9))] pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { @@ -359,7 +359,7 @@ pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { } /// Blend packed 8-bit integers from `a` and `b` using `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendvb))] pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { @@ -368,7 +368,7 @@ pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m25 /// Broadcast the low packed 8-bit integer from `a` to all elements of /// the 128-bit returned value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastb))] pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { @@ -379,7 +379,7 @@ pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { /// Broadcast the low packed 8-bit integer from `a` to all elements of /// the 256-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastb))] pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { @@ -392,7 +392,7 @@ pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { // often compiled to vbroadcastss. /// Broadcast the low packed 32-bit integer from `a` to all elements of /// the 128-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { @@ -405,7 +405,7 @@ pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { // often compiled to vbroadcastss. /// Broadcast the low packed 32-bit integer from `a` to all elements of /// the 256-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { @@ -416,7 +416,7 @@ pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { /// Broadcast the low packed 64-bit integer from `a` to all elements of /// the 128-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastq))] pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { @@ -429,7 +429,7 @@ pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { // often compiled to vbroadcastsd. /// Broadcast the low packed 64-bit integer from `a` to all elements of /// the 256-bit returned value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastsd))] pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { @@ -440,7 +440,7 @@ pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { /// Broadcast the low double-precision (64-bit) floating-point element /// from `a` to all elements of the 128-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vmovddup))] pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d { @@ -449,7 +449,7 @@ pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d { /// Broadcast the low double-precision (64-bit) floating-point element /// from `a` to all elements of the 256-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastsd))] pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d { @@ -460,7 +460,7 @@ pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d { // vbroadcastf128. /// Broadcast 128 bits of integer data from a to all 128-bit lanes in /// the 256-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { let zero = _mm_setzero_si128(); @@ -470,7 +470,7 @@ pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { /// Broadcast the low single-precision (32-bit) floating-point element /// from `a` to all elements of the 128-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 { @@ -479,7 +479,7 @@ pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 { /// Broadcast the low single-precision (32-bit) floating-point element /// from `a` to all elements of the 256-bit returned value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 { @@ -488,7 +488,7 @@ pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 { /// Broadcast the low packed 16-bit integer from a to all elements of /// the 128-bit returned value -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastw))] pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { @@ -499,7 +499,7 @@ pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { /// Broadcast the low packed 16-bit integer from a to all elements of /// the 256-bit returned value -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastw))] pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { @@ -509,7 +509,7 @@ pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { } /// Compare packed 64-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqq))] pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -517,7 +517,7 @@ pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 32-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqd))] pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -525,7 +525,7 @@ pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 16-bit integers in `a` and `b` for equality. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqw))] pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -533,7 +533,7 @@ pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 8-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqb))] pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -541,7 +541,7 @@ pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 64-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtq))] pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -549,7 +549,7 @@ pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 32-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtd))] pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -557,7 +557,7 @@ pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 16-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtw))] pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -565,7 +565,7 @@ pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 8-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtb))] pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -573,7 +573,7 @@ pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { } /// Sign-extend 16-bit integers to 32-bit integers. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxwd))] pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { @@ -581,7 +581,7 @@ pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { } /// Sign-extend 16-bit integers to 64-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxwq))] pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { @@ -591,7 +591,7 @@ pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { } /// Sign-extend 32-bit integers to 64-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxdq))] pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { @@ -599,7 +599,7 @@ pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { } /// Sign-extend 8-bit integers to 16-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxbw))] pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { @@ -607,7 +607,7 @@ pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { } /// Sign-extend 8-bit integers to 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxbd))] pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { @@ -617,7 +617,7 @@ pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { } /// Sign-extend 8-bit integers to 64-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxbq))] pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { @@ -628,7 +628,7 @@ pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { /// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit /// integers, and store the results in dst. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxwd))] pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { @@ -637,7 +637,7 @@ pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit /// integers. The upper four elements of `a` are unused. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxwq))] pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { @@ -647,7 +647,7 @@ pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { } /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxdq))] pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { @@ -655,7 +655,7 @@ pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { } /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxbw))] pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { @@ -664,7 +664,7 @@ pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit /// integers. The upper eight elements of `a` are unused. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxbd))] pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { @@ -675,7 +675,7 @@ pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit /// integers. The upper twelve elements of `a` are unused. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxbq))] pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { @@ -685,7 +685,7 @@ pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { } /// Extract 128 bits (of integer data) from `a` selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vextractf128, imm8 = 1))] pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i { @@ -699,7 +699,7 @@ pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i { } /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphaddw))] pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -707,7 +707,7 @@ pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphaddd))] pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -716,7 +716,7 @@ pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphaddsw))] pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -724,7 +724,7 @@ pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Horizontally substract adjacent pairs of 16-bit integers in `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubw))] pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -732,7 +732,7 @@ pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Horizontally substract adjacent pairs of 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubd))] pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -741,7 +741,7 @@ pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubsw))] pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -751,7 +751,7 @@ pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm_i32gather_epi32( @@ -772,7 +772,7 @@ pub unsafe fn _mm_i32gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm_mask_i32gather_epi32( @@ -792,7 +792,7 @@ pub unsafe fn _mm_mask_i32gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm256_i32gather_epi32( @@ -813,7 +813,7 @@ pub unsafe fn _mm256_i32gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm256_mask_i32gather_epi32( @@ -833,7 +833,7 @@ pub unsafe fn _mm256_mask_i32gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm_i32gather_ps( @@ -853,7 +853,7 @@ pub unsafe fn _mm_i32gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm_mask_i32gather_ps( @@ -870,7 +870,7 @@ pub unsafe fn _mm_mask_i32gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm256_i32gather_ps( @@ -890,7 +890,7 @@ pub unsafe fn _mm256_i32gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm256_mask_i32gather_ps( @@ -907,7 +907,7 @@ pub unsafe fn _mm256_mask_i32gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm_i32gather_epi64( @@ -928,7 +928,7 @@ pub unsafe fn _mm_i32gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm_mask_i32gather_epi64( @@ -948,7 +948,7 @@ pub unsafe fn _mm_mask_i32gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm256_i32gather_epi64( @@ -969,7 +969,7 @@ pub unsafe fn _mm256_i32gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm256_mask_i32gather_epi64( @@ -989,7 +989,7 @@ pub unsafe fn _mm256_mask_i32gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm_i32gather_pd( @@ -1009,7 +1009,7 @@ pub unsafe fn _mm_i32gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm_mask_i32gather_pd( @@ -1026,7 +1026,7 @@ pub unsafe fn _mm_mask_i32gather_pd( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm256_i32gather_pd( @@ -1046,7 +1046,7 @@ pub unsafe fn _mm256_i32gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm256_mask_i32gather_pd( @@ -1063,7 +1063,7 @@ pub unsafe fn _mm256_mask_i32gather_pd( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm_i64gather_epi32( @@ -1084,7 +1084,7 @@ pub unsafe fn _mm_i64gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm_mask_i64gather_epi32( @@ -1104,7 +1104,7 @@ pub unsafe fn _mm_mask_i64gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm256_i64gather_epi32( @@ -1125,7 +1125,7 @@ pub unsafe fn _mm256_i64gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm256_mask_i64gather_epi32( @@ -1145,7 +1145,7 @@ pub unsafe fn _mm256_mask_i64gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm_i64gather_ps( @@ -1165,7 +1165,7 @@ pub unsafe fn _mm_i64gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm_mask_i64gather_ps( @@ -1182,7 +1182,7 @@ pub unsafe fn _mm_mask_i64gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm256_i64gather_ps( @@ -1202,7 +1202,7 @@ pub unsafe fn _mm256_i64gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm256_mask_i64gather_ps( @@ -1219,7 +1219,7 @@ pub unsafe fn _mm256_mask_i64gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm_i64gather_epi64( @@ -1240,7 +1240,7 @@ pub unsafe fn _mm_i64gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm_mask_i64gather_epi64( @@ -1260,7 +1260,7 @@ pub unsafe fn _mm_mask_i64gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm256_i64gather_epi64( @@ -1281,7 +1281,7 @@ pub unsafe fn _mm256_i64gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm256_mask_i64gather_epi64( @@ -1301,7 +1301,7 @@ pub unsafe fn _mm256_mask_i64gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm_i64gather_pd( @@ -1321,7 +1321,7 @@ pub unsafe fn _mm_i64gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm_mask_i64gather_pd( @@ -1338,7 +1338,7 @@ pub unsafe fn _mm_mask_i64gather_pd( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm256_i64gather_pd( @@ -1358,7 +1358,7 @@ pub unsafe fn _mm256_i64gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm256_mask_i64gather_pd( @@ -1374,7 +1374,7 @@ pub unsafe fn _mm256_mask_i64gather_pd( /// Copy `a` to `dst`, then insert 128 bits (of integer data) from `b` at the /// location specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_inserti128_si256( @@ -1392,7 +1392,7 @@ pub unsafe fn _mm256_inserti128_si256( /// Multiply packed signed 16-bit integers in `a` and `b`, producing /// intermediate signed 32-bit integers. Horizontally add adjacent pairs /// of intermediate 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaddwd))] pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1403,7 +1403,7 @@ pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { /// corresponding signed 8-bit integer from `b`, producing intermediate /// signed 16-bit integers. Horizontally add adjacent pairs of intermediate /// signed 16-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaddubsw))] pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1413,7 +1413,7 @@ pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Load packed 32-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i { @@ -1423,7 +1423,7 @@ pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i /// Load packed 32-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] pub unsafe fn _mm256_maskload_epi32( @@ -1435,7 +1435,7 @@ pub unsafe fn _mm256_maskload_epi32( /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i { @@ -1445,7 +1445,7 @@ pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] pub unsafe fn _mm256_maskload_epi64( @@ -1457,7 +1457,7 @@ pub unsafe fn _mm256_maskload_epi64( /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) { @@ -1467,7 +1467,7 @@ pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] pub unsafe fn _mm256_maskstore_epi32( @@ -1479,7 +1479,7 @@ pub unsafe fn _mm256_maskstore_epi32( /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) { @@ -1489,7 +1489,7 @@ pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] pub unsafe fn _mm256_maskstore_epi64( @@ -1500,7 +1500,7 @@ pub unsafe fn _mm256_maskstore_epi64( /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// maximum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsw))] pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1509,7 +1509,7 @@ pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 32-bit integers in `a` and `b`, and return the packed /// maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsd))] pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1518,7 +1518,7 @@ pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 8-bit integers in `a` and `b`, and return the packed /// maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsb))] pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -1527,7 +1527,7 @@ pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 16-bit integers in `a` and `b`, and return /// the packed maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxuw))] pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -1536,7 +1536,7 @@ pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 32-bit integers in `a` and `b`, and return /// the packed maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxud))] pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { @@ -1545,7 +1545,7 @@ pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 8-bit integers in `a` and `b`, and return /// the packed maximum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxub))] pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -1554,7 +1554,7 @@ pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsw))] pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1563,7 +1563,7 @@ pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 32-bit integers in `a` and `b`, and return the packed /// minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsd))] pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1572,7 +1572,7 @@ pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 8-bit integers in `a` and `b`, and return the packed /// minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsb))] pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -1581,7 +1581,7 @@ pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 16-bit integers in `a` and `b`, and return /// the packed minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminuw))] pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -1590,7 +1590,7 @@ pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 32-bit integers in `a` and `b`, and return /// the packed minimum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminud))] pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { @@ -1599,7 +1599,7 @@ pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 8-bit integers in `a` and `b`, and return /// the packed minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminub))] pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -1608,7 +1608,7 @@ pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { /// Create mask from the most significant bit of each 8-bit element in `a`, /// return the result. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovmskb))] pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 { @@ -1622,7 +1622,7 @@ pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 { /// selected from `b` starting at on the offset specified in `imm8`. Eight /// quadruplets are formed from sequential 8-bit integers selected from `a` /// starting at the offset specified in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vmpsadbw, imm8 = 0))] pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i { @@ -1639,7 +1639,7 @@ pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i /// `a` and `b` /// /// Return the 64-bit results. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmuldq))] pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1650,7 +1650,7 @@ pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { /// element in `a` and `b` /// /// Return the unsigned 64-bit results. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmuludq))] pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { @@ -1660,7 +1660,7 @@ pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers and returning the high 16 bits of the /// intermediate integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhw))] pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1670,7 +1670,7 @@ pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers and returning the high 16 bits of the /// intermediate integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhuw))] pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -1680,7 +1680,7 @@ pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers, and return the low 16 bits of the /// intermediate integers -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmullw))] pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1690,7 +1690,7 @@ pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed 32-bit integers in `a` and `b`, producing /// intermediate 64-bit integers, and return the low 16 bits of the /// intermediate integers -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulld))] pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1701,7 +1701,7 @@ pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { /// 
intermediate signed 32-bit integers. Truncate each intermediate /// integer to the 18 most significant bits, round by adding 1, and /// return bits [16:1] -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhrsw))] pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1710,7 +1710,7 @@ pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Compute the bitwise OR of 256 bits (representing integer data) in `a` /// and `b` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vorps))] pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { @@ -1719,7 +1719,7 @@ pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpacksswb))] pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1728,7 +1728,7 @@ pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackssdw))] pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1737,7 +1737,7 @@ pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackuswb))] pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1746,7 +1746,7 @@ pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using unsigned 
saturation -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackusdw))] pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1757,7 +1757,7 @@ pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { /// /// The last 3 bits of each integer of `b` are used as addresses into the 8 /// integers of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermd))] pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1765,7 +1765,7 @@ pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Permutes 64-bit integers from `a` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermq, imm8 = 9))] pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i { @@ -1817,7 +1817,7 @@ pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i { } /// Shuffle 128-bits of integer data selected by `imm8` from `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))] pub unsafe fn _mm256_permute2x128_si256( @@ -1835,7 +1835,7 @@ pub unsafe fn _mm256_permute2x128_si256( /// Shuffle 64-bit floating-point elements in `a` across lanes using the /// control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))] pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d { @@ -1887,7 +1887,7 @@ pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d { /// Shuffle eight 32-bit foating-point elements in `a` across lanes using /// the corresponding 32-bit integer index in `idx`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermps))] pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { @@ -1898,7 +1898,7 @@ pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { /// and `b`, then horizontally sum each consecutive 8 differences to /// produce four unsigned 16-bit integers, and pack these unsigned 16-bit /// integers in the low 16 bits of the 64-bit return value -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsadbw))] pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -1934,7 +1934,7 @@ pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { /// r /// } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufb))] pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -1974,7 +1974,7 @@ pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i { @@ -2035,7 +2035,7 @@ pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i { /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using /// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied /// to the output. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))] pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2092,7 +2092,7 @@ pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i { /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using /// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied /// to the output. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))] pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2149,7 +2149,7 @@ pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i { /// Negate packed 16-bit integers in `a` when the corresponding signed /// 16-bit integer in `b` is negative, and return the results. /// Results are zeroed out when the corresponding element in `b` is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignw))] pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2159,7 +2159,7 @@ pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { /// Negate packed 32-bit integers in `a` when the corresponding signed /// 32-bit integer in `b` is negative, and return the results. /// Results are zeroed out when the corresponding element in `b` is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignd))] pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -2169,7 +2169,7 @@ pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { /// Negate packed 8-bit integers in `a` when the corresponding signed /// 8-bit integer in `b` is negative, and return the results. /// Results are zeroed out when the corresponding element in `b` is zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignb))] pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2178,7 +2178,7 @@ pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { /// Shift packed 16-bit integers in `a` left by `count` while /// shifting in zeros, and return the result -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllw))] pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { @@ -2187,7 +2187,7 @@ pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 32-bit integers in `a` left by `count` while /// shifting in zeros, and return the result -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslld))] pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { @@ -2196,7 +2196,7 @@ pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 64-bit integers in `a` left by `count` while /// shifting in zeros, and return the result -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllq))] pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { @@ -2205,7 +2205,7 @@ pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 16-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllw))] pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2214,7 +2214,7 @@ pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslld))] pub unsafe fn 
_mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i { @@ -2223,7 +2223,7 @@ pub unsafe fn _mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 64-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllq))] pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i { @@ -2231,7 +2231,7 @@ pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i { } /// Shift 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))] pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i { @@ -2245,7 +2245,7 @@ pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i { } /// Shift 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))] pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i { @@ -2255,7 +2255,7 @@ pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvd))] pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -2265,7 +2265,7 @@ pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvd))] pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { @@ -2275,7 +2275,7 @@ pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { /// Shift packed 64-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvq))] pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { @@ -2285,7 +2285,7 @@ pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 64-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvq))] pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { @@ -2294,7 +2294,7 @@ pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { /// Shift packed 16-bit integers in `a` right by `count` while /// shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsraw))] pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { @@ -2303,7 +2303,7 @@ pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 32-bit integers in `a` right by `count` while /// shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrad))] pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { @@ -2312,7 +2312,7 @@ pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 16-bit integers in `a` right by `imm8` while /// shifting in sign bits. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsraw))] pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2321,7 +2321,7 @@ pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` right by `imm8` while /// shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrad))] pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i { @@ -2330,7 +2330,7 @@ pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` right by the amount specified by the /// corresponding element in `count` while shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsravd))] pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -2339,7 +2339,7 @@ pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` right by the amount specified by the /// corresponding element in `count` while shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsravd))] pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { @@ -2347,7 +2347,7 @@ pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { } /// Shift 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))] pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i { @@ -2361,7 +2361,7 @@ pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i { } /// Shift 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))] pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i { @@ -2370,7 +2370,7 @@ pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 16-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlw))] pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { @@ -2379,7 +2379,7 @@ pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 32-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrld))] pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { @@ -2388,7 +2388,7 @@ pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 64-bit integers in `a` right by `count` while shifting in /// zeros. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlq))] pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { @@ -2397,7 +2397,7 @@ pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in /// zeros -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlw))] pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2406,7 +2406,7 @@ pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in /// zeros -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrld))] pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i { @@ -2415,7 +2415,7 @@ pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in /// zeros -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlq))] pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i { @@ -2424,7 +2424,7 @@ pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvd))] pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -2433,7 +2433,7 @@ pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvd))] pub unsafe fn _mm256_srlv_epi32(a: 
__m256i, count: __m256i) -> __m256i { @@ -2442,7 +2442,7 @@ pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { /// Shift packed 64-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvq))] pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { @@ -2451,7 +2451,7 @@ pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 64-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvq))] pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { @@ -2461,7 +2461,7 @@ pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { // TODO _mm256_stream_load_si256 (__m256i const* mem_addr) /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubw))] pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2469,7 +2469,7 @@ pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Subtract packed 32-bit integers in `b` from packed 16-bit integers in `a` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubd))] pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -2477,7 +2477,7 @@ pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Subtract packed 64-bit integers in `b` from packed 16-bit integers in `a` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubq))] pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -2485,7 +2485,7 @@ pub unsafe fn 
_mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { } /// Subtract packed 8-bit integers in `b` from packed 16-bit integers in `a` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubb))] pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2494,7 +2494,7 @@ pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in /// `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubsw))] pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2503,7 +2503,7 @@ pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in /// `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubsb))] pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2512,7 +2512,7 @@ pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { /// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit /// integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubusw))] pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -2521,7 +2521,7 @@ pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { /// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit /// integers in `a` using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubusb))] pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -2560,7 +2560,7 @@ pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhbw))] pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2605,7 +2605,7 @@ pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklbw))] pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2648,7 +2648,7 @@ pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhwd))] pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2689,7 +2689,7 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklwd))] pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2729,7 +2729,7 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhdq))] pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -2765,7 +2765,7 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckldq))] pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -2801,7 +2801,7 @@ pub 
unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhqdq))] pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -2837,7 +2837,7 @@ pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklqdq))] pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -2847,7 +2847,7 @@ pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { /// Compute the bitwise XOR of 256 bits (representing integer data) /// in `a` and `b` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { @@ -2858,7 +2858,7 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468][https://reviews.llvm.org/D20468]. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i8 { @@ -2870,7 +2870,7 @@ pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i8 { /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468][https://reviews.llvm.org/D20468]. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i16 { @@ -2879,7 +2879,7 @@ pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i16 { } /// Extract a 32-bit integer from `a`, selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. 
pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 { @@ -2888,7 +2888,7 @@ pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 { } /// Returns the first element of the input vector of [4 x double]. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] //#[cfg_attr(test, assert_instr(movsd))] FIXME pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 { @@ -2896,7 +2896,7 @@ pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 { } /// Returns the first element of the input vector of [8 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] //#[cfg_attr(test, assert_instr(movd))] FIXME pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { diff --git a/library/stdarch/coresimd/src/x86/i586/bmi.rs b/library/stdarch/coresimd/src/x86/i586/bmi.rs index 9b3eee2aa02e..512695e049cf 100644 --- a/library/stdarch/coresimd/src/x86/i586/bmi.rs +++ b/library/stdarch/coresimd/src/x86/i586/bmi.rs @@ -14,7 +14,7 @@ use stdsimd_test::assert_instr; /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(bextr))] pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { @@ -26,7 +26,7 @@ pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { /// /// Bits [7,0] of `control` specify the index to the first bit in the range to /// be extracted, and bits [15,8] specify the length of the range. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(bextr))] pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 { @@ -34,7 +34,7 @@ pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 { } /// Bitwise logical `AND` of inverted `a` with `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(andn))] pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 { @@ -42,7 +42,7 @@ pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 { } /// Extract lowest set isolated bit. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsi))] pub unsafe fn _blsi_u32(x: u32) -> u32 { @@ -50,7 +50,7 @@ pub unsafe fn _blsi_u32(x: u32) -> u32 { } /// Get mask up to lowest set bit. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsmsk))] pub unsafe fn _blsmsk_u32(x: u32) -> u32 { @@ -60,7 +60,7 @@ pub unsafe fn _blsmsk_u32(x: u32) -> u32 { /// Resets the lowest set bit of `x`. /// /// If `x` is sets CF. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsr))] pub unsafe fn _blsr_u32(x: u32) -> u32 { @@ -70,7 +70,7 @@ pub unsafe fn _blsr_u32(x: u32) -> u32 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(tzcnt))] pub unsafe fn _tzcnt_u32(x: u32) -> u32 { @@ -80,7 +80,7 @@ pub unsafe fn _tzcnt_u32(x: u32) -> u32 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits. 
-#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(tzcnt))] pub unsafe fn _mm_tzcnt_32(x: u32) -> i32 { diff --git a/library/stdarch/coresimd/src/x86/i586/bmi2.rs b/library/stdarch/coresimd/src/x86/i586/bmi2.rs index adc963e0f534..e4d393f99064 100644 --- a/library/stdarch/coresimd/src/x86/i586/bmi2.rs +++ b/library/stdarch/coresimd/src/x86/i586/bmi2.rs @@ -17,7 +17,7 @@ use stdsimd_test::assert_instr; /// /// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with /// the low half and the high half of the result. -#[inline(always)] +#[inline] // LLVM BUG (should be mulxl): https://bugs.llvm.org/show_bug.cgi?id=34232 #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))] #[cfg_attr(all(test, target_arch = "x86"), assert_instr(mulx))] @@ -29,7 +29,7 @@ pub unsafe fn _mulx_u32(a: u32, b: u32, hi: &mut u32) -> u32 { } /// Zero higher bits of `a` >= `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(bzhi))] pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 { @@ -38,7 +38,7 @@ pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 { /// Scatter contiguous low order bits of `a` to the result at the positions /// specified by the `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pdep))] pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 { @@ -47,7 +47,7 @@ pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 { /// Gathers the bits of `x` specified by the `mask` into the contiguous low /// order bit positions of the result. 
-#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pext))] pub unsafe fn _pext_u32(a: u32, mask: u32) -> u32 { diff --git a/library/stdarch/coresimd/src/x86/i586/bswap.rs b/library/stdarch/coresimd/src/x86/i586/bswap.rs index 8bac16756922..92f1634bf506 100644 --- a/library/stdarch/coresimd/src/x86/i586/bswap.rs +++ b/library/stdarch/coresimd/src/x86/i586/bswap.rs @@ -6,14 +6,14 @@ use stdsimd_test::assert_instr; /// Return an integer with the reversed byte order of x -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(bswap))] pub unsafe fn _bswap(x: i32) -> i32 { bswap_i32(x) } /// Return an integer with the reversed byte order of x -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(bswap))] pub unsafe fn _bswap64(x: i64) -> i64 { bswap_i64(x) diff --git a/library/stdarch/coresimd/src/x86/i586/cpuid.rs b/library/stdarch/coresimd/src/x86/i586/cpuid.rs index 2480eb58e065..eeb7ac3681ea 100644 --- a/library/stdarch/coresimd/src/x86/i586/cpuid.rs +++ b/library/stdarch/coresimd/src/x86/i586/cpuid.rs @@ -42,7 +42,7 @@ pub struct CpuidResult { /// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID /// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf /// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(cpuid))] pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult { let mut r = ::core::mem::uninitialized::(); @@ -62,14 +62,14 @@ pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult { } /// See [`__cpuid_count`](fn.__cpuid_count.html). -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(cpuid))] pub unsafe fn __cpuid(leaf: u32) -> CpuidResult { __cpuid_count(leaf, 0) } /// Does the host support the `cpuid` instruction? 
-#[inline(always)] +#[inline] pub fn has_cpuid() -> bool { #[cfg(target_arch = "x86_64")] { @@ -111,7 +111,7 @@ pub fn has_cpuid() -> bool { /// /// See also [`__cpuid`](fn.__cpuid.html) and /// [`__cpuid_count`](fn.__cpuid_count.html). -#[inline(always)] +#[inline] pub unsafe fn __get_cpuid_max(leaf: u32) -> (u32, u32) { let CpuidResult { eax, ebx, .. } = __cpuid(leaf); (eax, ebx) diff --git a/library/stdarch/coresimd/src/x86/i586/rdtsc.rs b/library/stdarch/coresimd/src/x86/i586/rdtsc.rs index f9929aaa6b0f..9649562cdcf8 100644 --- a/library/stdarch/coresimd/src/x86/i586/rdtsc.rs +++ b/library/stdarch/coresimd/src/x86/i586/rdtsc.rs @@ -15,7 +15,7 @@ use stdsimd_test::assert_instr; /// /// On processors that support the Intel 64 architecture, the /// high-order 32 bits of each of RAX and RDX are cleared. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rdtsc))] pub unsafe fn _rdtsc() -> u64 { rdtsc() @@ -35,7 +35,7 @@ pub unsafe fn _rdtsc() -> u64 { /// /// On processors that support the Intel 64 architecture, the /// high-order 32 bits of each of RAX, RDX, and RCX are cleared. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rdtscp))] pub unsafe fn _rdtscp(aux: *mut u32) -> u64 { rdtscp(aux as *mut _) diff --git a/library/stdarch/coresimd/src/x86/i586/sse.rs b/library/stdarch/coresimd/src/x86/i586/sse.rs index 8911429a486f..57b3f42a2406 100644 --- a/library/stdarch/coresimd/src/x86/i586/sse.rs +++ b/library/stdarch/coresimd/src/x86/i586/sse.rs @@ -13,7 +13,7 @@ use stdsimd_test::assert_instr; /// Adds the first component of `a` and `b`, the other components are copied /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(addss))] pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 { @@ -21,7 +21,7 @@ pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 { } /// Adds __m128 vectors. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(addps))] pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 { @@ -30,7 +30,7 @@ pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 { /// Subtracts the first component of `b` from `a`, the other components are /// copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(subss))] pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 { @@ -38,7 +38,7 @@ pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 { } /// Subtracts __m128 vectors. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(subps))] pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 { @@ -47,7 +47,7 @@ pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 { /// Multiplies the first component of `a` and `b`, the other components are /// copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(mulss))] pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 { @@ -55,7 +55,7 @@ pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 { } /// Multiplies __m128 vectors. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(mulps))] pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 { @@ -64,7 +64,7 @@ pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 { /// Divides the first component of `b` by `a`, the other components are /// copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(divss))] pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 { @@ -72,7 +72,7 @@ pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 { } /// Divides __m128 vectors. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(divps))] pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 { @@ -81,7 +81,7 @@ pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 { /// Return the square root of the first single-precision (32-bit) /// floating-point element in `a`, the other elements are unchanged. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(sqrtss))] pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 { @@ -90,7 +90,7 @@ pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 { /// Return the square root of packed single-precision (32-bit) floating-point /// elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(sqrtps))] pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 { @@ -99,7 +99,7 @@ pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 { /// Return the approximate reciprocal of the first single-precision /// (32-bit) floating-point element in `a`, the other elements are unchanged. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rcpss))] pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 { @@ -108,7 +108,7 @@ pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 { /// Return the approximate reciprocal of packed single-precision (32-bit) /// floating-point elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rcpps))] pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 { @@ -117,7 +117,7 @@ pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 { /// Return the approximate reciprocal square root of the fist single-precision /// (32-bit) floating-point elements in `a`, the other elements are unchanged. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rsqrtss))] pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 { @@ -126,7 +126,7 @@ pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 { /// Return the approximate reciprocal square root of packed single-precision /// (32-bit) floating-point elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rsqrtps))] pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 { @@ -136,7 +136,7 @@ pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 { /// Compare the first single-precision (32-bit) floating-point element of `a` /// and `b`, and return the minimum value in the first element of the return /// value, the other elements are copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(minss))] pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 { @@ -145,7 +145,7 @@ pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 { /// Compare packed single-precision (32-bit) floating-point elements in `a` and /// `b`, and return the corresponding minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(minps))] pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 { @@ -155,7 +155,7 @@ pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 { /// Compare the first single-precision (32-bit) floating-point element of `a` /// and `b`, and return the maximum value in the first element of the return /// value, the other elements are copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(maxss))] pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 { @@ -164,7 +164,7 @@ pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 { /// Compare packed single-precision (32-bit) floating-point elements in `a` and /// `b`, and return the corresponding maximum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(maxps))] pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 { @@ -172,7 +172,7 @@ pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 { } /// Bitwise AND of packed single-precision (32-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `and` instructions, so ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), @@ -187,7 +187,7 @@ pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 { /// elements. /// /// Computes `!a & b` for each bit in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `not` and `and` instructions, so ignore // it. @@ -201,7 +201,7 @@ pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 { } /// Bitwise OR of packed single-precision (32-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `or` instructions, so we ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), @@ -214,7 +214,7 @@ pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 { /// Bitwise exclusive OR of packed single-precision (32-bit) floating-point /// elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `xor` instructions, so we ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), @@ -228,7 +228,7 @@ pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 { /// Compare the lowest `f32` of both inputs for equality. The lowest 32 bits of /// the result will be `0xffffffff` if the two inputs are equal, or `0` /// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpeqss))] pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 { @@ -239,7 +239,7 @@ pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 { /// of the result will be `0xffffffff` if `a.extract(0)` is less than /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the /// upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltss))] pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 { @@ -250,7 +250,7 @@ pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 { /// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than /// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result /// are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpless))] pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 { @@ -261,7 +261,7 @@ pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 { /// bits of the result will be `0xffffffff` if `a.extract(0)` is greater /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result /// are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltss))] pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 { @@ -272,7 +272,7 @@ pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 { /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is /// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits /// of the result are the upper 96 bits of `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpless))] pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 { @@ -283,7 +283,7 @@ pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 { /// of the result will be `0xffffffff` if `a.extract(0)` is not equal to /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the /// upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpneqss))] pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 { @@ -294,7 +294,7 @@ pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 { /// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the /// upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltss))] pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 { @@ -305,7 +305,7 @@ pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 { /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not /// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits /// of the result are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnless))] pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 { @@ -316,7 +316,7 @@ pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 { /// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are /// the upper 96 bits of `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltss))] pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 { @@ -327,7 +327,7 @@ pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 { /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not /// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 /// bits of the result are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnless))] pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 { @@ -338,7 +338,7 @@ pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 { /// the result will be `0xffffffff` if neither of `a.extract(0)` or /// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result /// are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpordss))] pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 { @@ -349,7 +349,7 @@ pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 { /// of the result will be `0xffffffff` if any of `a.extract(0)` or /// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result /// are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpunordss))] pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 { @@ -359,7 +359,7 @@ pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input elements /// were equal, or `0` otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpeqps))] pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 { @@ -369,7 +369,7 @@ pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is less than the corresponding element in `b`, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltps))] pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 { @@ -380,7 +380,7 @@ pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is less than or equal to the corresponding element in `b`, or `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpleps))] pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 { @@ -390,7 +390,7 @@ pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is greater than the corresponding element in `b`, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltps))] pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 { @@ -401,7 +401,7 @@ pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is greater than or equal to the corresponding element in `b`, or `0` /// otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpleps))] pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 { @@ -411,7 +411,7 @@ pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input elements /// are *not* equal, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpneqps))] pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 { @@ -422,7 +422,7 @@ pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* less than the corresponding element in `b`, or `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltps))] pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 { @@ -433,7 +433,7 @@ pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* less than or equal to the corresponding element in `b`, or /// `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnleps))] pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 { @@ -444,7 +444,7 @@ pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* greater than the corresponding element in `b`, or `0` /// otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltps))] pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 { @@ -455,7 +455,7 @@ pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* greater than or equal to the corresponding element in `b`, /// or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnleps))] pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 { @@ -466,7 +466,7 @@ pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 { /// Returns four floats that have one of two possible bit patterns. The element /// in the output vector will be `0xffffffff` if the input elements in `a` and /// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpordps))] pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 { @@ -477,7 +477,7 @@ pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 { /// Returns four floats that have one of two possible bit patterns. The element /// in the output vector will be `0xffffffff` if the input elements in `a` and /// `b` are unordered (i.e., at least on of them is a NaN), or 0 otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpunordps))] pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 { @@ -486,7 +486,7 @@ pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if they are equal, or `0` otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 { @@ -495,7 +495,7 @@ pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 { @@ -505,7 +505,7 @@ pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is less than or equal to the one from `b`, or `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 { @@ -515,7 +515,7 @@ pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is greater than the one from `b`, or `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 { @@ -525,7 +525,7 @@ pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is greater than or equal to the one from `b`, or /// `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 { @@ -534,7 +534,7 @@ pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. 
Returns /// `1` if they are *not* equal, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 { @@ -544,7 +544,7 @@ pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if they are equal, or `0` otherwise. This instruction will not signal /// an exception if either argument is a quiet NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 { @@ -555,7 +555,7 @@ pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. /// This instruction will not signal an exception if either argument is a quiet /// NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 { @@ -566,7 +566,7 @@ pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is less than or equal to the one from `b`, or `0` /// otherwise. This instruction will not signal an exception if either argument /// is a quiet NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 { @@ -577,7 +577,7 @@ pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is greater than the one from `b`, or `0` /// otherwise. This instruction will not signal an exception if either argument /// is a quiet NaN. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 { @@ -588,7 +588,7 @@ pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is greater than or equal to the one from `b`, or /// `0` otherwise. This instruction will not signal an exception if either /// argument is a quiet NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 { @@ -598,7 +598,7 @@ pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if they are *not* equal, or `0` otherwise. This instruction will not /// signal an exception if either argument is a quiet NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { @@ -613,7 +613,7 @@ pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { /// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTSS2SI` instruction (with 32 bit output). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtss2si))] pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 { @@ -621,7 +621,7 @@ pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 { } /// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtss2si))] pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 { @@ -638,7 +638,7 @@ pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 { /// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output). 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvttss2si))] pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 { @@ -646,7 +646,7 @@ pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 { } /// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvttss2si))] pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 { @@ -654,7 +654,7 @@ pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 { } /// Extract the lowest 32 bit float from the input vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // No point in using assert_instrs. In Unix x86_64 calling convention this is a // no-op, and on Windows it's just a `mov`. @@ -667,7 +667,7 @@ pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 { /// /// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit /// input). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtsi2ss))] pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 { @@ -675,7 +675,7 @@ pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 { } /// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtsi2ss))] pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 { @@ -684,7 +684,7 @@ pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 { /// Construct a `__m128` with the lowest element set to `a` and the rest set to /// zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_set_ss(a: f32) -> __m128 { @@ -692,7 +692,7 @@ pub unsafe fn _mm_set_ss(a: f32) -> __m128 { } /// Construct a `__m128` with all element set to `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(shufps))] pub unsafe fn _mm_set1_ps(a: f32) -> __m128 { @@ -700,7 +700,7 @@ pub unsafe fn _mm_set1_ps(a: f32) -> __m128 { } /// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html) -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(shufps))] pub unsafe fn _mm_set_ps1(a: f32) -> __m128 { @@ -724,7 +724,7 @@ pub unsafe fn _mm_set_ps1(a: f32) -> __m128 { /// ```text /// let v = _mm_set_ps(d, c, b, a); /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(unpcklps))] pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { @@ -739,7 +739,7 @@ pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { /// ```text /// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d)); /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(unpcklps))] // On a 32-bit architecture it just copies the operands from the stack. @@ -749,7 +749,7 @@ pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { } /// Construct a `__m128` with all elements initialized to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(xorps))] pub unsafe fn _mm_setzero_ps() -> __m128 { @@ -761,7 +761,7 @@ pub unsafe fn _mm_setzero_ps() -> __m128 { /// /// The lower half of result takes values from `a` and the higher half from /// `b`. Mask is split to 2 control bits each to index the element from inputs. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(shufps, mask = 3))] pub unsafe fn _mm_shuffle_ps(a: __m128, b: __m128, mask: u32) -> __m128 { @@ -812,7 +812,7 @@ pub unsafe fn _mm_shuffle_ps(a: __m128, b: __m128, mask: u32) -> __m128 { /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the higher half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(unpckhps))] pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 { @@ -821,7 +821,7 @@ pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 { /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the lower half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(unpcklps))] pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 { @@ -830,7 +830,7 @@ pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 { /// Combine higher half of `a` and `b`. The highwe half of `b` occupies the /// lower half of result. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(all(test, not(windows)), assert_instr(movhlps))] #[cfg_attr(all(test, windows), assert_instr(unpckhpd))] @@ -841,7 +841,7 @@ pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 { /// Combine lower half of `a` and `b`. The lower half of `b` occupies the /// higher half of result. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(all(test, target_feature = "sse2"), assert_instr(unpcklpd))] #[cfg_attr(all(test, not(target_feature = "sse2")), assert_instr(movlhps))] @@ -853,7 +853,7 @@ pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 { /// /// The mask is stored in the 4 least significant bits of the return value. /// All other bits are set to `0`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movmskps))] pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { @@ -892,7 +892,7 @@ pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // TODO: generates MOVHPD if the CPU supports SSE2. // #[cfg_attr(test, assert_instr(movhps))] @@ -943,7 +943,7 @@ pub unsafe fn _mm_loadh_pi(a: __m128, p: *const __m64) -> __m128 { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // TODO: generates MOVLPD if the CPU supports SSE2. // #[cfg_attr(test, assert_instr(movlps))] @@ -966,7 +966,7 @@ pub unsafe fn _mm_loadl_pi(a: __m128, p: *const __m64) -> __m128 { /// elements set to zero. /// /// This corresponds to instructions `VMOVSS` / `MOVSS`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 { @@ -978,7 +978,7 @@ pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 { /// /// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some /// shuffling. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 { @@ -987,7 +987,7 @@ pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 { } /// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html) -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { @@ -1002,7 +1002,7 @@ pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { /// memory. /// /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { @@ -1016,7 +1016,7 @@ pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { /// may be faster. /// /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movups))] pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { @@ -1049,7 +1049,7 @@ pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { /// /// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some /// shuffling. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { @@ -1061,7 +1061,7 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { /// /// This intrinsic corresponds to the `MOVHPS` instruction. The compiler may /// choose to generate an equivalent sequence of other instructions. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // On i686 and up LLVM actually generates MOVHPD instead of MOVHPS, that's // fine. @@ -1091,7 +1091,7 @@ pub unsafe fn _mm_storeh_pi(p: *mut __m64, a: __m128) { /// /// This intrinsic corresponds to the `MOVQ` instruction. The compiler may /// choose to generate an equivalent sequence of other instructions. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // On i586 the codegen just generates plane MOVs. No need to test for that. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2"), @@ -1121,7 +1121,7 @@ pub unsafe fn _mm_storel_pi(p: *mut __m64, a: __m128) { /// Store the lowest 32 bit float of `a` into memory. /// /// This intrinsic corresponds to the `MOVSS` instruction. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { @@ -1144,7 +1144,7 @@ pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { /// *p.offset(2) = x; /// *p.offset(3) = x; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { @@ -1153,7 +1153,7 @@ pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { } /// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html) -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { @@ -1169,7 +1169,7 @@ pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { /// memory. /// /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { @@ -1181,7 +1181,7 @@ pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { /// faster. /// /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movups))] pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { @@ -1206,7 +1206,7 @@ pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { /// *p.offset(2) = a.extract(1); /// *p.offset(3) = a.extract(0); /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { @@ -1221,7 +1221,7 @@ pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { /// ```text /// _mm_move_ss(a, b) == a.replace(0, b.extract(0)) /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 { @@ -1234,7 +1234,7 @@ pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 { /// Guarantees that every store instruction that precedes, in program order, is /// globally visible before any store instruction which follows the fence in /// program order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(sfence))] pub unsafe fn _mm_sfence() { @@ -1244,7 +1244,7 @@ pub unsafe fn _mm_sfence() { /// Get the unsigned 32-bit value of the MXCSR control and status register. 
/// /// For more info see [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(stmxcsr))] pub unsafe fn _mm_getcsr() -> u32 { @@ -1378,7 +1378,7 @@ pub unsafe fn _mm_getcsr() -> u32 { /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on /// ``` /// -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ldmxcsr))] pub unsafe fn _mm_setcsr(val: u32) { @@ -1435,7 +1435,7 @@ pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000; pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { @@ -1443,7 +1443,7 @@ pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { @@ -1451,7 +1451,7 @@ pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { @@ -1459,7 +1459,7 @@ pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { @@ -1467,7 +1467,7 @@ pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { @@ -1475,7 +1475,7 @@ pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn 
_MM_SET_EXCEPTION_STATE(x: u32) { @@ -1483,7 +1483,7 @@ pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { @@ -1493,7 +1493,7 @@ pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) { @@ -1548,7 +1548,7 @@ pub const _MM_HINT_NTA: i8 = 0; /// * Prefetching may also fail if there are not enough memory-subsystem /// resources (e.g., request buffers). /// -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(prefetcht0, strategy = _MM_HINT_T0))] #[cfg_attr(test, assert_instr(prefetcht1, strategy = _MM_HINT_T1))] @@ -1573,7 +1573,7 @@ pub unsafe fn _mm_prefetch(p: *const u8, strategy: i8) { } /// Return vector of type __m128 with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] pub unsafe fn _mm_undefined_ps() -> __m128 { __m128( @@ -1585,7 +1585,7 @@ pub unsafe fn _mm_undefined_ps() -> __m128 { } /// Transpose the 4x4 matrix formed by 4 rows of __m128 in place. -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_TRANSPOSE4_PS( @@ -1684,7 +1684,7 @@ extern "C" { /// /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection /// exception _may_ be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movntps))] pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { @@ -1693,7 +1693,7 @@ pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { /// Store 64-bits of integer data from a into memory using a non-temporal /// memory hint. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse,mmx")] #[cfg_attr(test, assert_instr(movntq))] pub unsafe fn _mm_stream_pi(mem_addr: *mut __m64, a: __m64) { diff --git a/library/stdarch/coresimd/src/x86/i586/sse2.rs b/library/stdarch/coresimd/src/x86/i586/sse2.rs index c0555679bfe4..ab4a574e2d71 100644 --- a/library/stdarch/coresimd/src/x86/i586/sse2.rs +++ b/library/stdarch/coresimd/src/x86/i586/sse2.rs @@ -16,7 +16,7 @@ use x86::*; /// /// This can help improve the performance and power consumption of spin-wait /// loops. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pause))] pub unsafe fn _mm_pause() { @@ -25,7 +25,7 @@ pub unsafe fn _mm_pause() { /// Invalidate and flush the cache line that contains `p` from all levels of /// the cache hierarchy. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(clflush))] pub unsafe fn _mm_clflush(p: *mut u8) { @@ -38,7 +38,7 @@ pub unsafe fn _mm_clflush(p: *mut u8) { /// Guarantees that every load instruction that precedes, in program order, is /// globally visible before any load instruction which follows the fence in /// program order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(lfence))] pub unsafe fn _mm_lfence() { @@ -51,7 +51,7 @@ pub unsafe fn _mm_lfence() { /// Guarantees that every memory access that precedes, in program order, the /// memory fence instruction is globally visible before any memory instruction /// which follows the fence in program order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(mfence))] pub unsafe fn _mm_mfence() { @@ -59,7 +59,7 @@ pub unsafe fn _mm_mfence() { } /// Add packed 8-bit integers in `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddb))] pub unsafe fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -67,7 +67,7 @@ pub unsafe fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddw))] pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -75,7 +75,7 @@ pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddd))] pub unsafe fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -83,7 +83,7 @@ pub unsafe fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 64-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddq))] pub unsafe fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -91,7 +91,7 @@ pub unsafe fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddsb))] pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -99,7 +99,7 @@ pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddsw))] pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -107,7 +107,7 @@ pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddusb))] pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -115,7 +115,7 @@ pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddusw))] pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -123,7 +123,7 @@ pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { } /// Average packed unsigned 8-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pavgb))] pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -131,7 +131,7 @@ pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { } /// Average packed unsigned 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pavgw))] pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -143,7 +143,7 @@ pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { /// Multiply packed signed 16-bit integers in `a` and `b`, producing /// intermediate signed 32-bit integers. Horizontally add adjacent pairs of /// intermediate 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmaddwd))] pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -152,7 +152,7 @@ pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// maximum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmaxsw))] pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -161,7 +161,7 @@ pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 8-bit integers in `a` and `b`, and return the /// packed maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmaxub))] pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -170,7 +170,7 @@ pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pminsw))] pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -179,7 +179,7 @@ pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 8-bit integers in `a` and `b`, and return the /// packed minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pminub))] pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -190,7 +190,7 @@ pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { /// /// The multiplication produces intermediate 32-bit integers, and returns the /// high 16 bits of the intermediate integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmulhw))] pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -201,7 +201,7 @@ pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { /// /// The multiplication produces intermediate 32-bit integers, and returns the /// high 16 bits of the intermediate integers. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmulhuw))] pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -212,7 +212,7 @@ pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { /// /// The multiplication produces intermediate 32-bit integers, and returns the /// low 16 bits of the intermediate integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmullw))] pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -223,7 +223,7 @@ pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { /// in `a` and `b`. /// /// Return the unsigned 64-bit results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmuludq))] pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { @@ -236,7 +236,7 @@ pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { /// and `b`, then horizontally sum each consecutive 8 differences to produce /// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in /// the low 16 bits of 64-bit elements returned. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psadbw))] pub unsafe fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -244,7 +244,7 @@ pub unsafe fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubb))] pub unsafe fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -252,7 +252,7 @@ pub unsafe fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubw))] pub unsafe fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -260,7 +260,7 @@ pub unsafe fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubd))] pub unsafe fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -268,7 +268,7 @@ pub unsafe fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubq))] pub unsafe fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -277,7 +277,7 @@ pub unsafe fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubsb))] pub unsafe fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -286,7 +286,7 @@ pub unsafe fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubsw))] pub unsafe fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -295,7 +295,7 @@ pub unsafe fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit /// integers in `a` using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubusb))] pub unsafe fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -304,7 +304,7 @@ pub unsafe fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit /// integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubusw))] pub unsafe fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -312,7 +312,7 @@ pub unsafe fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { } /// Shift `a` left by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslldq, imm8 = 1))] pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { @@ -355,7 +355,7 @@ pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { } /// Shift `a` left by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslldq, imm8 = 1))] pub unsafe fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i { @@ -363,7 +363,7 @@ pub unsafe fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i { } /// Shift `a` right by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrldq, imm8 = 1))] pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i { @@ -371,7 +371,7 @@ pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i { } /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllw))] pub unsafe fn _mm_slli_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -380,7 +380,7 @@ pub unsafe fn _mm_slli_epi16(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 16-bit integers in `a` left by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllw))] pub unsafe fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { @@ -388,7 +388,7 @@ pub unsafe fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { } /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslld))] pub unsafe fn _mm_slli_epi32(a: __m128i, imm8: i32) -> __m128i { @@ -397,7 +397,7 @@ pub unsafe fn _mm_slli_epi32(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 32-bit integers in `a` left by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslld))] pub unsafe fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -405,7 +405,7 @@ pub unsafe fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { } /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllq))] pub unsafe fn _mm_slli_epi64(a: __m128i, imm8: i32) -> __m128i { @@ -414,7 +414,7 @@ pub unsafe fn _mm_slli_epi64(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 64-bit integers in `a` left by `count` while shifting in /// zeros. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllq))] pub unsafe fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { @@ -423,7 +423,7 @@ pub unsafe fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign /// bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psraw))] pub unsafe fn _mm_srai_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -432,7 +432,7 @@ pub unsafe fn _mm_srai_epi16(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign /// bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psraw))] pub unsafe fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { @@ -441,7 +441,7 @@ pub unsafe fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign /// bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrad))] pub unsafe fn _mm_srai_epi32(a: __m128i, imm8: i32) -> __m128i { @@ -450,7 +450,7 @@ pub unsafe fn _mm_srai_epi32(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign /// bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrad))] pub unsafe fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -458,7 +458,7 @@ pub unsafe fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { } /// Shift `a` right by `imm8` bytes while shifting in zeros. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrldq, imm8 = 1))] pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { @@ -502,7 +502,7 @@ pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlw))] pub unsafe fn _mm_srli_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -511,7 +511,7 @@ pub unsafe fn _mm_srli_epi16(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 16-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlw))] pub unsafe fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { @@ -520,7 +520,7 @@ pub unsafe fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrld))] pub unsafe fn _mm_srli_epi32(a: __m128i, imm8: i32) -> __m128i { @@ -529,7 +529,7 @@ pub unsafe fn _mm_srli_epi32(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 32-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrld))] pub unsafe fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -538,7 +538,7 @@ pub unsafe fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in /// zeros. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlq))] pub unsafe fn _mm_srli_epi64(a: __m128i, imm8: i32) -> __m128i { @@ -547,7 +547,7 @@ pub unsafe fn _mm_srli_epi64(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 64-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlq))] pub unsafe fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { @@ -556,7 +556,7 @@ pub unsafe fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andps))] pub unsafe fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { @@ -565,7 +565,7 @@ pub unsafe fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { /// Compute the bitwise NOT of 128 bits (representing integer data) in `a` and /// then AND with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andnps))] pub unsafe fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { @@ -574,7 +574,7 @@ pub unsafe fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(orps))] pub unsafe fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { @@ -583,7 +583,7 @@ pub unsafe fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and /// `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] pub unsafe fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { @@ -591,7 +591,7 @@ pub unsafe fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 8-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpeqb))] pub unsafe fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -599,7 +599,7 @@ pub unsafe fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 16-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpeqw))] pub unsafe fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -607,7 +607,7 @@ pub unsafe fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 32-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpeqd))] pub unsafe fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -615,7 +615,7 @@ pub unsafe fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 8-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtb))] pub unsafe fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -623,7 +623,7 @@ pub unsafe fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 16-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtw))] pub unsafe fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -631,7 +631,7 @@ pub unsafe fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 32-bit integers in `a` and `b` for greater-than. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtd))] pub unsafe fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -639,7 +639,7 @@ pub unsafe fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 8-bit integers in `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtb))] pub unsafe fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -647,7 +647,7 @@ pub unsafe fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 16-bit integers in `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtw))] pub unsafe fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -655,7 +655,7 @@ pub unsafe fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 32-bit integers in `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtd))] pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -664,7 +664,7 @@ pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { /// Convert the lower two packed 32-bit integers in `a` to packed /// double-precision (64-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtdq2pd))] pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { @@ -674,7 +674,7 @@ pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsi2sd))] pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { @@ -683,7 +683,7 @@ pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtdq2ps))] pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { @@ -692,7 +692,7 @@ pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtps2dq))] pub unsafe fn _mm_cvtps_epi32(a: __m128) -> __m128i { @@ -701,7 +701,7 @@ pub unsafe fn _mm_cvtps_epi32(a: __m128) -> __m128i { /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movd))] pub unsafe fn _mm_cvtsi32_si128(a: i32) -> __m128i { @@ -709,7 +709,7 @@ pub unsafe fn _mm_cvtsi32_si128(a: i32) -> __m128i { } /// Return the lowest element of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movd))] // FIXME mov on windows pub unsafe fn _mm_cvtsi128_si32(a: __m128i) -> i32 { @@ -718,7 +718,7 @@ pub unsafe fn _mm_cvtsi128_si32(a: __m128i) -> i32 { /// Set packed 64-bit integers with the supplied values, from highest to /// lowest. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { @@ -726,7 +726,7 @@ pub unsafe fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { } /// Set packed 32-bit integers with the supplied values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { @@ -734,7 +734,7 @@ pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { } /// Set packed 16-bit integers with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set_epi16( @@ -744,7 +744,7 @@ pub unsafe fn _mm_set_epi16( } /// Set packed 8-bit integers with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set_epi8( @@ -758,7 +758,7 @@ pub unsafe fn _mm_set_epi8( } /// Broadcast 64-bit integer `a` to all elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set1_epi64x(a: i64) -> __m128i { @@ -766,7 +766,7 @@ pub unsafe fn _mm_set1_epi64x(a: i64) -> __m128i { } /// Broadcast 32-bit integer `a` to all elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set1_epi32(a: i32) -> __m128i { @@ -774,7 +774,7 @@ pub unsafe fn _mm_set1_epi32(a: i32) -> __m128i { } /// Broadcast 16-bit integer `a` to all elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set1_epi16(a: i16) -> __m128i { @@ -782,7 +782,7 @@ pub unsafe fn _mm_set1_epi16(a: i16) -> __m128i { } /// Broadcast 8-bit integer `a` to all elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set1_epi8(a: i8) -> __m128i { @@ -790,7 +790,7 @@ pub unsafe fn _mm_set1_epi8(a: i8) -> __m128i { } /// Set packed 32-bit integers with the supplied values in reverse order. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { @@ -798,7 +798,7 @@ pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { } /// Set packed 16-bit integers with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_setr_epi16( @@ -808,7 +808,7 @@ pub unsafe fn _mm_setr_epi16( } /// Set packed 8-bit integers with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_setr_epi8( @@ -822,7 +822,7 @@ pub unsafe fn _mm_setr_epi8( } /// Returns a vector with all elements set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] pub unsafe fn _mm_setzero_si128() -> __m128i { @@ -830,7 +830,7 @@ pub unsafe fn _mm_setzero_si128() -> __m128i { } /// Load 64-bit integer from memory into first element of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // FIXME movsd on windows #[cfg_attr(all(test, not(windows), @@ -844,7 +844,7 @@ pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i { /// Load 128-bits of integer data from memory into a new vector. /// /// `mem_addr` must be aligned on a 16-byte boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i { @@ -854,7 +854,7 @@ pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i { /// Load 128-bits of integer data from memory into a new vector. /// /// `mem_addr` does not need to be aligned on any particular boundary. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i { @@ -875,7 +875,7 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i { /// /// `mem_addr` should correspond to a 128-bit memory location and does not need /// to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(maskmovdqu))] pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) { @@ -885,7 +885,7 @@ pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) /// Store 128-bits of integer data from `a` into memory. /// /// `mem_addr` must be aligned on a 16-byte boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) { @@ -895,7 +895,7 @@ pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) { /// Store 128-bits of integer data from `a` into memory. /// /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { @@ -905,7 +905,7 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { /// Store the lower 64-bit integer `a` to a memory location. /// /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // FIXME mov on windows, movlps on i686 #[cfg_attr(all(test, not(windows), @@ -923,7 +923,7 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { /// Stores a 128-bit integer vector to a 128-bit aligned memory location. 
/// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movntps))] // FIXME movntdq pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { @@ -933,7 +933,7 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { /// Stores a 32-bit integer value in the specified memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movnti))] pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { @@ -942,7 +942,7 @@ pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { /// Return a vector where the low element is extracted from `a` and its upper /// element is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // FIXME movd on windows, movd on i686 #[cfg_attr(all(test, not(windows), target_arch = "x86_64"), @@ -955,7 +955,7 @@ pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i { /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(packsswb))] pub unsafe fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -964,7 +964,7 @@ pub unsafe fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(packssdw))] pub unsafe fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -973,7 +973,7 @@ pub unsafe fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(packuswb))] pub unsafe fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -981,7 +981,7 @@ pub unsafe fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Return the `imm8` element of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pextrw, imm8 = 9))] pub unsafe fn _mm_extract_epi16(a: __m128i, imm8: i32) -> i32 { @@ -989,7 +989,7 @@ pub unsafe fn _mm_extract_epi16(a: __m128i, imm8: i32) -> i32 { } /// Return a new vector where the `imm8` element of `a` is replaced with `i`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pinsrw, imm8 = 9))] pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32, imm8: i32) -> __m128i { @@ -997,7 +997,7 @@ pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32, imm8: i32) -> __m128i { } /// Return a mask of the most significant bit of each element in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmovmskb))] pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 { @@ -1005,7 +1005,7 @@ pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 { } /// Shuffle 32-bit integers in `a` using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pshufd, imm8 = 9))] pub unsafe fn _mm_shuffle_epi32(a: __m128i, imm8: i32) -> __m128i { @@ -1068,7 +1068,7 @@ pub unsafe fn _mm_shuffle_epi32(a: __m128i, imm8: i32) -> __m128i { /// /// Put the results in the high 64 bits of the returned vector, with the low 64 /// bits being copied from `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pshufhw, imm8 = 9))] pub unsafe fn _mm_shufflehi_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -1126,7 +1126,7 @@ pub unsafe fn _mm_shufflehi_epi16(a: __m128i, imm8: i32) -> __m128i { /// /// Put the results in the low 64 bits of the returned vector, with the high 64 /// bits being copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pshuflw, imm8 = 9))] pub unsafe fn _mm_shufflelo_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -1179,7 +1179,7 @@ pub unsafe fn _mm_shufflelo_epi16(a: __m128i, imm8: i32) -> __m128i { } /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhbw))] pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -1191,7 +1191,7 @@ pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 16-bit integers from the high half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhwd))] pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -1200,7 +1200,7 @@ pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 32-bit integers from the high half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhdq))] pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -1208,7 +1208,7 @@ pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhqdq))] pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -1216,7 +1216,7 @@ pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpcklbw))] pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -1228,7 +1228,7 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpcklwd))] pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -1237,7 +1237,7 @@ pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckldq))] pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -1245,7 +1245,7 @@ pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpcklqdq))] pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -1254,7 +1254,7 @@ pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// Return a new vector with the low element of `a` replaced by the sum of the /// low elements of `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(addsd))] pub unsafe fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1263,7 +1263,7 @@ pub unsafe fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d { /// Add packed double-precision (64-bit) floating-point elements in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(addpd))] pub unsafe fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1272,7 +1272,7 @@ pub unsafe fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the result of /// dividing the lower element of `a` by the lower element of `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(divsd))] pub unsafe fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1281,7 +1281,7 @@ pub unsafe fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d { /// Divide packed double-precision (64-bit) floating-point elements in `a` by /// packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(divpd))] pub unsafe fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1290,7 +1290,7 @@ pub unsafe fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the maximum /// of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(maxsd))] pub unsafe fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1299,7 +1299,7 @@ pub unsafe fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the maximum values from corresponding elements in /// `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(maxpd))] pub unsafe fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1308,7 +1308,7 @@ pub unsafe fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the minimum /// of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(minsd))] pub unsafe fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1317,7 +1317,7 @@ pub unsafe fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the minimum values from corresponding elements in /// `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(minpd))] pub unsafe fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1326,7 +1326,7 @@ pub unsafe fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by multiplying the /// low elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(mulsd))] pub unsafe fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1335,7 +1335,7 @@ pub unsafe fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d { /// Multiply packed double-precision (64-bit) floating-point elements in `a` /// and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(mulpd))] pub unsafe fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1344,7 +1344,7 @@ pub unsafe fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the square /// root of the lower element `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(sqrtsd))] pub unsafe fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1352,7 +1352,7 @@ pub unsafe fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d { } /// Return a new vector with the square root of each of the values in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(sqrtpd))] pub unsafe fn _mm_sqrt_pd(a: __m128d) -> __m128d { @@ -1361,7 +1361,7 @@ pub unsafe fn _mm_sqrt_pd(a: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by subtracting the /// low element by `b` from the low element of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(subsd))] pub unsafe fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1370,7 +1370,7 @@ pub unsafe fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d { /// Subtract packed double-precision (64-bit) floating-point elements in `b` /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(subpd))] pub unsafe fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1379,7 +1379,7 @@ pub unsafe fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d { /// Compute the bitwise AND of packed double-precision (64-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andps))] pub unsafe fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1389,7 +1389,7 @@ pub unsafe fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compute the bitwise NOT of `a` and then AND with `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andnps))] pub unsafe fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1399,7 +1399,7 @@ pub unsafe fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compute the bitwise OR of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(orps))] pub unsafe fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1409,7 +1409,7 @@ pub unsafe fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compute the bitwise XOR of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] pub unsafe fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1420,7 +1420,7 @@ pub unsafe fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the equality /// comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpeqsd))] pub unsafe fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1429,7 +1429,7 @@ pub unsafe fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the less-than /// comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltsd))] pub unsafe fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1438,7 +1438,7 @@ pub unsafe fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// less-than-or-equal comparison of the lower elements of `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplesd))] pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1447,7 +1447,7 @@ pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// greater-than comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltsd))] pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1456,7 +1456,7 @@ pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// greater-than-or-equal comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplesd))] pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1467,7 +1467,7 @@ pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { /// of comparing both of the lower elements of `a` and `b` to `NaN`. If /// neither are equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpordsd))] pub unsafe fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1477,7 +1477,7 @@ pub unsafe fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the result of /// comparing both of the lower elements of `a` and `b` to `NaN`. If either is /// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpunordsd))] pub unsafe fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1486,7 +1486,7 @@ pub unsafe fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the not-equal /// comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpneqsd))] pub unsafe fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1495,7 +1495,7 @@ pub unsafe fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// not-less-than comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltsd))] pub unsafe fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1504,7 +1504,7 @@ pub unsafe fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// not-less-than-or-equal comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlesd))] pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1513,7 +1513,7 @@ pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// not-greater-than comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltsd))] pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1522,7 +1522,7 @@ pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// not-greater-than-or-equal comparison of the lower elements of `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlesd))] pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1530,7 +1530,7 @@ pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpeqpd))] pub unsafe fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1538,7 +1538,7 @@ pub unsafe fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltpd))] pub unsafe fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1546,7 +1546,7 @@ pub unsafe fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for less-than-or-equal -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplepd))] pub unsafe fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1554,7 +1554,7 @@ pub unsafe fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltpd))] pub unsafe fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1562,7 +1562,7 @@ pub unsafe fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for greater-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplepd))] pub unsafe fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1570,7 +1570,7 @@ pub unsafe fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` to see if neither is `NaN`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpordpd))] pub unsafe fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1578,7 +1578,7 @@ pub unsafe fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` to see if either is `NaN`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpunordpd))] pub unsafe fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1586,7 +1586,7 @@ pub unsafe fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for not-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpneqpd))] pub unsafe fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1594,7 +1594,7 @@ pub unsafe fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for not-less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltpd))] pub unsafe fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1602,7 +1602,7 @@ pub unsafe fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for not-less-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlepd))] pub unsafe fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1610,7 +1610,7 @@ pub unsafe fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for not-greater-than. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltpd))] pub unsafe fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1619,7 +1619,7 @@ pub unsafe fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d { /// Compare corresponding elements in `a` and `b` for /// not-greater-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlepd))] pub unsafe fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1627,7 +1627,7 @@ pub unsafe fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare the lower element of `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comieq_sd(a: __m128d, b: __m128d) -> bool { @@ -1635,7 +1635,7 @@ pub unsafe fn _mm_comieq_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comilt_sd(a: __m128d, b: __m128d) -> bool { @@ -1643,7 +1643,7 @@ pub unsafe fn _mm_comilt_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for less-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comile_sd(a: __m128d, b: __m128d) -> bool { @@ -1651,7 +1651,7 @@ pub unsafe fn _mm_comile_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comigt_sd(a: __m128d, b: __m128d) -> bool { @@ -1659,7 +1659,7 @@ pub unsafe fn _mm_comigt_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for greater-than-or-equal. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comige_sd(a: __m128d, b: __m128d) -> bool { @@ -1667,7 +1667,7 @@ pub unsafe fn _mm_comige_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for not-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comineq_sd(a: __m128d, b: __m128d) -> bool { @@ -1675,7 +1675,7 @@ pub unsafe fn _mm_comineq_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> bool { @@ -1683,7 +1683,7 @@ pub unsafe fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> bool { @@ -1691,7 +1691,7 @@ pub unsafe fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for less-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> bool { @@ -1699,7 +1699,7 @@ pub unsafe fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> bool { @@ -1707,7 +1707,7 @@ pub unsafe fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for greater-than-or-equal. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> bool { @@ -1715,7 +1715,7 @@ pub unsafe fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for not-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> bool { @@ -1724,7 +1724,7 @@ pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> bool { /// Convert packed double-precision (64-bit) floating-point elements in "a" to /// packed single-precision (32-bit) floating-point elements -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtpd2ps))] pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 { @@ -1734,7 +1734,7 @@ pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 { /// Convert packed single-precision (32-bit) floating-point elements in `a` to /// packed /// double-precision (64-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtps2pd))] pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d { @@ -1743,7 +1743,7 @@ pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d { /// Convert packed double-precision (64-bit) floating-point elements in `a` to /// packed 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtpd2dq))] pub unsafe fn _mm_cvtpd_epi32(a: __m128d) -> __m128i { @@ -1752,7 +1752,7 @@ pub unsafe fn _mm_cvtpd_epi32(a: __m128d) -> __m128i { /// Convert the lower double-precision (64-bit) floating-point element in a to /// a 32-bit integer. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2si))] pub unsafe fn _mm_cvtsd_si32(a: __m128d) -> i32 { @@ -1763,7 +1763,7 @@ pub unsafe fn _mm_cvtsd_si32(a: __m128d) -> i32 { /// to a single-precision (32-bit) floating-point element, store the result in /// the lower element of the return value, and copy the upper element from `a` /// to the upper element the return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2ss))] pub unsafe fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 { @@ -1771,7 +1771,7 @@ pub unsafe fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 { } /// Return the lower double-precision (64-bit) floating-point element of "a". -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, windows), assert_instr(movsd))] // FIXME movq/movlps/mov on other platform pub unsafe fn _mm_cvtsd_f64(a: __m128d) -> f64 { @@ -1782,7 +1782,7 @@ pub unsafe fn _mm_cvtsd_f64(a: __m128d) -> f64 { /// to a double-precision (64-bit) floating-point element, store the result in /// the lower element of the return value, and copy the upper element from `a` /// to the upper element the return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtss2sd))] pub unsafe fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d { @@ -1791,7 +1791,7 @@ pub unsafe fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d { /// Convert packed double-precision (64-bit) floating-point elements in `a` to /// packed 32-bit integers with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttpd2dq))] pub unsafe fn _mm_cvttpd_epi32(a: __m128d) -> __m128i { @@ -1800,7 +1800,7 @@ pub unsafe fn _mm_cvttpd_epi32(a: __m128d) -> __m128i { /// Convert the lower double-precision (64-bit) floating-point element in `a` /// to a 32-bit integer with truncation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttsd2si))] pub unsafe fn _mm_cvttsd_si32(a: __m128d) -> i32 { @@ -1809,7 +1809,7 @@ pub unsafe fn _mm_cvttsd_si32(a: __m128d) -> i32 { /// Convert packed single-precision (32-bit) floating-point elements in `a` to /// packed 32-bit integers with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttps2dq))] pub unsafe fn _mm_cvttps_epi32(a: __m128) -> __m128i { @@ -1818,7 +1818,7 @@ pub unsafe fn _mm_cvttps_epi32(a: __m128) -> __m128i { /// Copy double-precision (64-bit) floating-point element `a` to the lower /// element of the packed 64-bit return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_set_sd(a: f64) -> __m128d { _mm_set_pd(0.0, a) @@ -1826,7 +1826,7 @@ pub unsafe fn _mm_set_sd(a: f64) -> __m128d { /// Broadcast double-precision (64-bit) floating-point value a to all elements /// of the return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_set1_pd(a: f64) -> __m128d { _mm_set_pd(a, a) @@ -1834,7 +1834,7 @@ pub unsafe fn _mm_set1_pd(a: f64) -> __m128d { /// Broadcast double-precision (64-bit) floating-point value a to all elements /// of the return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_set_pd1(a: f64) -> __m128d { _mm_set_pd(a, a) @@ -1842,7 +1842,7 @@ pub unsafe fn _mm_set_pd1(a: f64) -> __m128d { /// Set packed double-precision (64-bit) floating-point elements in the return /// value with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_set_pd(a: f64, b: f64) -> __m128d { __m128d(b, a) @@ -1850,7 +1850,7 @@ pub unsafe fn _mm_set_pd(a: f64, b: f64) -> __m128d { /// Set packed double-precision (64-bit) floating-point elements in the return /// value with the supplied values in reverse order. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> __m128d { _mm_set_pd(b, a) @@ -1858,7 +1858,7 @@ pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> __m128d { /// Returns packed double-precision (64-bit) floating-point elements with all /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] // FIXME xorpd expected pub unsafe fn _mm_setzero_pd() -> __m128d { @@ -1869,7 +1869,7 @@ pub unsafe fn _mm_setzero_pd() -> __m128d { /// /// The mask is stored in the 2 least significant bits of the return value. /// All other bits are set to `0`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movmskpd))] pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 { @@ -1880,7 +1880,7 @@ pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 { /// floating-point elements) from memory into the returned vector. /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection /// exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d { @@ -1889,7 +1889,7 @@ pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d { /// Loads a 64-bit double-precision value to the low element of a /// 128-bit integer vector and clears the upper element. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movsd))] pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d { @@ -1899,7 +1899,7 @@ pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d { /// Loads a double-precision value into the high-order bits of a 128-bit /// vector of [2 x double]. The low-order bits are copied from the low-order /// bits of the first operand. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movhpd))] pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d { @@ -1909,7 +1909,7 @@ pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d { /// Loads a double-precision value into the low-order bits of a 128-bit /// vector of [2 x double]. The high-order bits are copied from the /// high-order bits of the first operand. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movlpd))] pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d { @@ -1920,7 +1920,7 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d { /// aligned memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movntps))] // FIXME movntpd pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { @@ -1929,7 +1929,7 @@ pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a /// memory location. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movlps))] // FIXME movsd only on windows pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) { @@ -1939,7 +1939,7 @@ pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) { /// Store 128-bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from `a` into memory. `mem_addr` must be aligned /// on a 16-byte boundary or a general-protection exception may be generated. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) { @@ -1949,7 +1949,7 @@ pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) { /// Store 128-bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) { @@ -1959,7 +1959,7 @@ pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) { /// Store the lower double-precision (64-bit) floating-point element from `a` /// into 2 contiguous elements in memory. `mem_addr` must be aligned on a /// 16-byte boundary or a general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) { let b: __m128d = simd_shuffle2(a, a, [0, 0]); @@ -1969,7 +1969,7 @@ pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) { /// Store the lower double-precision (64-bit) floating-point element from `a` /// into 2 contiguous elements in memory. `mem_addr` must be aligned on a /// 16-byte boundary or a general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) { let b: __m128d = simd_shuffle2(a, a, [0, 0]); @@ -1980,7 +1980,7 @@ pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) { /// memory in reverse order. /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection /// exception may be generated. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) { let b: __m128d = simd_shuffle2(a, a, [1, 0]); @@ -1989,7 +1989,7 @@ pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) { /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a /// memory location. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movhpd))] pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) { @@ -1998,7 +1998,7 @@ pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) { /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a /// memory location. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movlps))] // FIXME movlpd (movsd on windows) pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) { @@ -2007,7 +2007,7 @@ pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) { /// Load a double-precision (64-bit) floating-point element from memory /// into both elements of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d { @@ -2017,7 +2017,7 @@ pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d { /// Load a double-precision (64-bit) floating-point element from memory /// into both elements of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d { @@ -2027,7 +2027,7 @@ pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d { /// Load 2 double-precision (64-bit) floating-point elements from memory into /// the returned vector in reverse order. 
`mem_addr` must be aligned on a /// 16-byte boundary or a general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movapd))] pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d { @@ -2038,7 +2038,7 @@ pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d { /// Load 128-bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from memory into the returned vector. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d { @@ -2054,7 +2054,7 @@ pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d { /// Constructs a 128-bit floating-point vector of [2 x double] from two /// 128-bit vector parameters of [2 x double], using the immediate-value /// parameter as a specifier. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(shufpd, imm8 = 1))] pub unsafe fn _mm_shuffle_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { @@ -2069,7 +2069,7 @@ pub unsafe fn _mm_shuffle_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// Constructs a 128-bit floating-point vector of [2 x double]. The lower /// 64 bits are set to the lower 64 bits of the second parameter. The upper /// 64 bits are set to the upper 64 bits of the first parameter. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movsd))] pub unsafe fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d { @@ -2078,7 +2078,7 @@ pub unsafe fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d { /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit /// floating-point vector of [4 x float]. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castpd_ps(a: __m128d) -> __m128 { mem::transmute(a) @@ -2086,7 +2086,7 @@ pub unsafe fn _mm_castpd_ps(a: __m128d) -> __m128 { /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit /// integer vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castpd_si128(a: __m128d) -> __m128i { mem::transmute::(simd_cast(a)) @@ -2094,7 +2094,7 @@ pub unsafe fn _mm_castpd_si128(a: __m128d) -> __m128i { /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit /// floating-point vector of [2 x double]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castps_pd(a: __m128) -> __m128d { mem::transmute(a) @@ -2102,7 +2102,7 @@ pub unsafe fn _mm_castps_pd(a: __m128) -> __m128d { /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit /// integer vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castps_si128(a: __m128) -> __m128i { mem::transmute(a) @@ -2110,7 +2110,7 @@ pub unsafe fn _mm_castps_si128(a: __m128) -> __m128i { /// Casts a 128-bit integer vector into a 128-bit floating-point vector /// of [2 x double]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castsi128_pd(a: __m128i) -> __m128d { simd_cast(a.as_i64x2()) @@ -2118,21 +2118,21 @@ pub unsafe fn _mm_castsi128_pd(a: __m128i) -> __m128d { /// Casts a 128-bit integer vector into a 128-bit floating-point vector /// of [4 x float]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castsi128_ps(a: __m128i) -> __m128 { mem::transmute(a) } /// Return vector of type __m128d with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_undefined_pd() -> __m128d { _mm_set1_pd(mem::uninitialized()) } /// Return vector of type __m128i with undefined elements. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_undefined_si128() -> __m128i { _mm_set1_epi8(mem::uninitialized()) @@ -2143,7 +2143,7 @@ pub unsafe fn _mm_undefined_si128() -> __m128i { /// /// * The [127:64] bits are copied from the [127:64] bits of the second input /// * The [63:0] bits are copied from the [127:64] bits of the first input -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(unpckhpd))] pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d { @@ -2155,7 +2155,7 @@ pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d { /// /// * The [127:64] bits are copied from the [63:0] bits of the second input /// * The [63:0] bits are copied from the [63:0] bits of the first input -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(unpcklpd))] pub unsafe fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d { diff --git a/library/stdarch/coresimd/src/x86/i586/sse3.rs b/library/stdarch/coresimd/src/x86/i586/sse3.rs index cf26319612c0..f74341dffef2 100644 --- a/library/stdarch/coresimd/src/x86/i586/sse3.rs +++ b/library/stdarch/coresimd/src/x86/i586/sse3.rs @@ -9,7 +9,7 @@ use stdsimd_test::assert_instr; /// Alternatively add and subtract packed single-precision (32-bit) /// floating-point elements in `a` to/from packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(addsubps))] pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { @@ -18,7 +18,7 @@ pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { /// Alternatively add and subtract packed double-precision (64-bit) /// floating-point elements in `a` to/from packed elements in `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(addsubpd))] pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d { @@ -27,7 +27,7 @@ pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d { /// Horizontally add adjacent pairs of double-precision (64-bit) /// floating-point elements in `a` and `b`, and pack the results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(haddpd))] pub unsafe fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d { @@ -36,7 +36,7 @@ pub unsafe fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d { /// Horizontally add adjacent pairs of single-precision (32-bit) /// floating-point elements in `a` and `b`, and pack the results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(haddps))] pub unsafe fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 { @@ -45,7 +45,7 @@ pub unsafe fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 { /// Horizontally subtract adjacent pairs of double-precision (64-bit) /// floating-point elements in `a` and `b`, and pack the results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(hsubpd))] pub unsafe fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d { @@ -54,7 +54,7 @@ pub unsafe fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d { /// Horizontally add adjacent pairs of single-precision (32-bit) /// floating-point elements in `a` and `b`, and pack the results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(hsubps))] pub unsafe fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 { @@ -64,7 +64,7 @@ pub unsafe fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 { /// Load 128-bits of integer data from unaligned memory. /// This intrinsic may perform better than `_mm_loadu_si128` /// when the data crosses a cache line boundary. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(lddqu))] pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i { @@ -73,7 +73,7 @@ pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i { /// Duplicate the low double-precision (64-bit) floating-point element /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movddup))] pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d { @@ -82,7 +82,7 @@ pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d { /// Load a double-precision (64-bit) floating-point element from memory /// into both elements of return vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movddup))] pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d { @@ -91,7 +91,7 @@ pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d { /// Duplicate odd-indexed single-precision (32-bit) floating-point elements /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movshdup))] pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 { @@ -100,7 +100,7 @@ pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 { /// Duplicate even-indexed single-precision (32-bit) floating-point elements /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movsldup))] pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 { diff --git a/library/stdarch/coresimd/src/x86/i586/sse41.rs b/library/stdarch/coresimd/src/x86/i586/sse41.rs index bd63ed312202..4ca8397bd82d 100644 --- a/library/stdarch/coresimd/src/x86/i586/sse41.rs +++ b/library/stdarch/coresimd/src/x86/i586/sse41.rs @@ -47,7 +47,7 @@ pub const _MM_FROUND_NEARBYINT: i32 = /// The high bit of each corresponding mask byte determines the selection. /// If the high bit is set the element of `a` is selected. 
The element /// of `b` is selected otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pblendvb))] pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i { @@ -59,7 +59,7 @@ pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i /// The mask bits determine the selection. A clear bit selects the /// corresponding element of `a`, and a set bit the corresponding /// element of `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))] pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { @@ -73,7 +73,7 @@ pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// Blend packed double-precision (64-bit) floating-point elements from `a` /// and `b` using `mask` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendvpd))] pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { @@ -82,7 +82,7 @@ pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { /// Blend packed single-precision (32-bit) floating-point elements from `a` /// and `b` using `mask` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendvps))] pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { @@ -91,7 +91,7 @@ pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { /// Blend packed double-precision (64-bit) floating-point elements from `a` /// and `b` using control mask `imm2` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))] pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d { @@ -103,7 +103,7 @@ pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d { /// Blend packed single-precision (32-bit) 
floating-point elements from `a` /// and `b` using mask `imm4` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))] pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 { @@ -115,7 +115,7 @@ pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 { /// Extract a single-precision (32-bit) floating-point element from `a`, /// selected with `imm8` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] // TODO: Add test for Windows #[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8 = 0))] @@ -127,7 +127,7 @@ pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 { /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468][https://reviews.llvm.org/D20468]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pextrb, imm8 = 0))] pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 { @@ -136,7 +136,7 @@ pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 { } /// Extract an 32-bit integer from `a` selected with `imm8` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] // TODO: Add test for Windows #[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8 = 1))] @@ -167,7 +167,7 @@ pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 { /// /// * Bits `[3:0]`: If any of these bits are set, the corresponding result /// element is cleared. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))] pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { @@ -179,7 +179,7 @@ pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Return a copy of `a` with the 8-bit integer from `i` inserted at a /// location specified by `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))] pub unsafe fn _mm_insert_epi8(a: __m128i, i: i8, imm8: i32) -> __m128i { @@ -188,7 +188,7 @@ pub unsafe fn _mm_insert_epi8(a: __m128i, i: i8, imm8: i32) -> __m128i { /// Return a copy of `a` with the 32-bit integer from `i` inserted at a /// location specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))] pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i { @@ -197,7 +197,7 @@ pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i { /// Compare packed 8-bit integers in `a` and `b` and return packed maximum /// values in dst. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxsb))] pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -206,7 +206,7 @@ pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed /// maximum. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxuw))] pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -215,7 +215,7 @@ pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 32-bit integers in `a` and `b`, and return packed maximum /// values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxsd))] pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -224,7 +224,7 @@ pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed /// maximum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxud))] pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i { @@ -233,7 +233,7 @@ pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 8-bit integers in `a` and `b` and return packed minimum /// values in dst. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminsb))] pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -242,7 +242,7 @@ pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed /// minimum. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminuw))] pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -251,7 +251,7 @@ pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 32-bit integers in `a` and `b`, and return packed minimum /// values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminsd))] pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -260,7 +260,7 @@ pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed /// minimum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminud))] pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { @@ -269,7 +269,7 @@ pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using unsigned saturation -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(packusdw))] pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -277,7 +277,7 @@ pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 64-bit integers in `a` and `b` for equality -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pcmpeqq))] pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -285,7 +285,7 @@ pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i { } /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxbw))] pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i { @@ -295,7 +295,7 @@ pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i { } /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxbd))] pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i { @@ -306,7 +306,7 @@ pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i { /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed /// 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxbq))] pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i { @@ -316,7 +316,7 @@ pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i { } /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxwd))] pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i { @@ -326,7 +326,7 @@ pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i { } /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxwq))] pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i { @@ -336,7 +336,7 @@ pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i { } /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxdq))] pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i { @@ -346,7 +346,7 @@ pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i { } /// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxbw))] pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i { @@ -356,7 +356,7 @@ pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i { } /// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxbd))] pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i { @@ -366,7 +366,7 @@ pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i { } /// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxbq))] pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i { @@ -377,7 +377,7 @@ pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i { /// Zero extend packed unsigned 16-bit integers in `a` /// to packed 32-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = 
"sse4.1")] #[cfg_attr(test, assert_instr(pmovzxwd))] pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i { @@ -388,7 +388,7 @@ pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i { /// Zero extend packed unsigned 16-bit integers in `a` /// to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxwq))] pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i { @@ -399,7 +399,7 @@ pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i { /// Zero extend packed unsigned 32-bit integers in `a` /// to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxdq))] pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i { @@ -415,7 +415,7 @@ pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i { /// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of /// the dot product will be stored in the return value component. Otherwise if /// the broadcast mask bit is zero then the return component will be zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(dppd, imm8 = 0))] pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { @@ -432,7 +432,7 @@ pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of /// the dot product will be stored in the return value component. Otherwise if /// the broadcast mask bit is zero then the return component will be zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(dpps, imm8 = 0))] pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { @@ -445,7 +445,7 @@ pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Round the packed double-precision (64-bit) floating-point elements in `a` /// down to an integer value, and store the results as packed double-precision /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundpd))] pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d { @@ -455,7 +455,7 @@ pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d { /// Round the packed single-precision (32-bit) floating-point elements in `a` /// down to an integer value, and store the results as packed single-precision /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundps))] pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 { @@ -467,7 +467,7 @@ pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 { /// floating-point element in the lower element of the intrinsic result, /// and copy the upper element from `a` to the upper element of the intrinsic /// result. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundsd))] pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d { @@ -479,7 +479,7 @@ pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d { /// floating-point element in the lower element of the intrinsic result, /// and copy the upper 3 packed elements from `a` to the upper elements /// of the intrinsic result. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundss))] pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 { @@ -489,7 +489,7 @@ pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 { /// Round the packed double-precision (64-bit) floating-point elements in `a` /// up to an integer value, and store the results as packed double-precision /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundpd))] pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d { @@ -499,7 +499,7 @@ pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d { /// Round the packed single-precision (32-bit) floating-point elements in `a` /// up to an integer value, and store the results as packed single-precision /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundps))] pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 { @@ -511,7 +511,7 @@ pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 { /// floating-point element in the lower element of the intrisic result, /// and copy the upper element from `a` to the upper element /// of the intrinsic result. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundsd))] pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d { @@ -523,7 +523,7 @@ pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d { /// floating-point element in the lower element of the intrinsic result, /// and copy the upper 3 packed elements from `a` to the upper elements /// of the intrinsic result. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundss))] pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 { @@ -549,7 +549,7 @@ pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 { /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundpd, rounding = 0))] pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d { @@ -578,7 +578,7 @@ pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d { /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundps, rounding = 0))] pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 { @@ -609,7 +609,7 @@ pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 { /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundsd, rounding = 0))] pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { @@ -640,7 +640,7 @@ pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundss, rounding = 0))] pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { @@ -669,7 +669,7 @@ pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// * bits `[15:0]` - contain the minimum value found in parameter `a`, /// * bits `[18:16]` - contain the index of the minimum value /// * remaining bits are set to `0`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(phminposuw))] pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i { @@ -678,7 +678,7 @@ pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i { /// Multiply the low 32-bit integers from each packed 64-bit /// element in `a` and `b`, and return the signed 64-bit result. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmuldq))] pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -691,7 +691,7 @@ pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { /// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping /// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would return a /// negative number. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmulld))] pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -729,7 +729,7 @@ pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i { /// /// * A `__m128i` vector containing the sums of the sets of /// absolute differences between both operands. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))] pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i { diff --git a/library/stdarch/coresimd/src/x86/i586/sse42.rs b/library/stdarch/coresimd/src/x86/i586/sse42.rs index f358426d3113..f850306d2943 100644 --- a/library/stdarch/coresimd/src/x86/i586/sse42.rs +++ b/library/stdarch/coresimd/src/x86/i586/sse42.rs @@ -48,7 +48,7 @@ pub const _SIDD_UNIT_MASK: i32 = 0b0100_0000; /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return the generated mask. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistrm, imm8 = 0))] pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i32) -> __m128i { @@ -258,7 +258,7 @@ pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html /// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html /// [`_mm_cmpestri`]: fn._mm_cmpestri.html -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -273,7 +273,7 @@ pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return `1` if any character in `b` was null. /// and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -288,7 +288,7 @@ pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return `1` if the resulting mask was non-zero, /// and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -303,7 +303,7 @@ pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and returns `1` if any character in `a` was null, /// and `0` otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -317,7 +317,7 @@ pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return bit `0` of the resulting bit mask. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -332,7 +332,7 @@ pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return `1` if `b` did not contain a null /// character and the resulting mask was zero, and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -346,7 +346,7 @@ pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return the generated mask. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestrm, imm8 = 0))] pub unsafe fn _mm_cmpestrm( @@ -439,7 +439,7 @@ pub unsafe fn _mm_cmpestrm( /// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html /// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html /// [`_mm_cmpistri`]: fn._mm_cmpistri.html -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestri( @@ -456,7 +456,7 @@ pub unsafe fn _mm_cmpestri( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return `1` if any character in /// `b` was null, and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestrz( @@ -473,7 +473,7 @@ pub unsafe fn _mm_cmpestrz( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return `1` if the resulting mask /// was non-zero, and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestrc( @@ -490,7 +490,7 @@ pub unsafe fn _mm_cmpestrc( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return `1` if any character in /// a was null, and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestrs( @@ -507,7 +507,7 @@ pub unsafe fn _mm_cmpestrs( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return bit `0` of the resulting /// bit mask. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestro( @@ -525,7 +525,7 @@ pub unsafe fn _mm_cmpestro( /// using the control in `imm8`, and return `1` if `b` did not /// contain a null character and the resulting mask was zero, and `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestra( @@ -541,7 +541,7 @@ pub unsafe fn _mm_cmpestra( /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 8-bit integer `v`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] pub unsafe fn _mm_crc32_u8(crc: u32, v: u8) -> u32 { @@ -550,7 +550,7 @@ pub unsafe fn _mm_crc32_u8(crc: u32, v: u8) -> u32 { /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 16-bit integer `v`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] pub unsafe fn _mm_crc32_u16(crc: u32, v: u16) -> u32 { @@ -559,7 +559,7 @@ pub unsafe fn _mm_crc32_u16(crc: u32, v: u16) -> u32 { /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 32-bit integer `v`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] pub unsafe fn _mm_crc32_u32(crc: u32, v: u32) -> u32 { diff --git a/library/stdarch/coresimd/src/x86/i586/ssse3.rs b/library/stdarch/coresimd/src/x86/i586/ssse3.rs index f0498ef53176..01e461bd79ed 100644 --- a/library/stdarch/coresimd/src/x86/i586/ssse3.rs +++ b/library/stdarch/coresimd/src/x86/i586/ssse3.rs @@ -11,7 +11,7 @@ use x86::*; /// Compute the absolute value of packed 8-bit signed integers in `a` and /// return the unsigned results. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pabsb))] pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i { @@ -21,7 +21,7 @@ pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i { /// Compute the absolute value of each of the packed 16-bit signed integers in /// `a` and /// return the 16-bit unsigned integer -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pabsw))] pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i { @@ -31,7 +31,7 @@ pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i { /// Compute the absolute value of each of the packed 32-bit signed integers in /// `a` and /// return the 32-bit unsigned integer -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pabsd))] pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i { @@ -62,7 +62,7 @@ pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i { /// r /// } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pshufb))] pub unsafe fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -71,7 +71,7 @@ pub unsafe fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i { /// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, /// shift the result right by `n` bytes, and return the low 16 bytes. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(palignr, n = 15))] pub unsafe fn _mm_alignr_epi8(a: __m128i, b: __m128i, n: i32) -> __m128i { @@ -129,7 +129,7 @@ pub unsafe fn _mm_alignr_epi8(a: __m128i, b: __m128i, n: i32) -> __m128i { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phaddw))] pub unsafe fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -139,7 +139,7 @@ pub unsafe fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are /// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phaddsw))] pub unsafe fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -148,7 +148,7 @@ pub unsafe fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [4 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phaddd))] pub unsafe fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -157,7 +157,7 @@ pub unsafe fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { /// Horizontally subtract the adjacent pairs of values contained in 2 /// packed 128-bit vectors of [8 x i16]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phsubw))] pub unsafe fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -168,7 +168,7 @@ pub unsafe fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { /// packed 128-bit vectors of [8 x i16]. Positive differences greater than /// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are /// saturated to 8000h. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phsubsw))] pub unsafe fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -177,7 +177,7 @@ pub unsafe fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { /// Horizontally subtract the adjacent pairs of values contained in 2 /// packed 128-bit vectors of [4 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phsubd))] pub unsafe fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -189,7 +189,7 @@ pub unsafe fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { /// integer values contained in the second source operand, add pairs of /// contiguous products with signed saturation, and writes the 16-bit sums to /// the corresponding bits in the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pmaddubsw))] pub unsafe fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -199,7 +199,7 @@ pub unsafe fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { /// Multiply packed 16-bit signed integer values, truncate the 32-bit /// product to the 18 most significant bits by right-shifting, round the /// truncated value by adding 1, and write bits [16:1] to the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pmulhrsw))] pub unsafe fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -210,7 +210,7 @@ pub unsafe fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { /// integer in `b` is negative, and return the result. /// Elements in result are zeroed out when the corresponding element in `b` /// is zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(psignb))] pub unsafe fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -221,7 +221,7 @@ pub unsafe fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { /// integer in `b` is negative, and return the results. /// Elements in result are zeroed out when the corresponding element in `b` /// is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(psignw))] pub unsafe fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -232,7 +232,7 @@ pub unsafe fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { /// integer in `b` is negative, and return the results. /// Element in result are zeroed out when the corresponding element in `b` /// is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(psignd))] pub unsafe fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i { diff --git a/library/stdarch/coresimd/src/x86/i586/tbm.rs b/library/stdarch/coresimd/src/x86/i586/tbm.rs index 30019673a23b..1a9b48ca29fa 100644 --- a/library/stdarch/coresimd/src/x86/i586/tbm.rs +++ b/library/stdarch/coresimd/src/x86/i586/tbm.rs @@ -27,7 +27,7 @@ extern "C" { /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { _bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32)) @@ -35,7 +35,7 @@ pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. 
-#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 { _bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64)) @@ -46,7 +46,7 @@ pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 { /// /// Bits [7,0] of `control` specify the index to the first bit in the range to /// be extracted, and bits [15,8] specify the length of the range. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] pub fn _bextr2_u32(a: u32, control: u32) -> u32 { unsafe { x86_tbm_bextri_u32(a, control) } @@ -57,7 +57,7 @@ pub fn _bextr2_u32(a: u32, control: u32) -> u32 { /// /// Bits [7,0] of `control` specify the index to the first bit in the range to /// be extracted, and bits [15,8] specify the length of the range. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] pub fn _bextr2_u64(a: u64, control: u64) -> u64 { unsafe { x86_tbm_bextri_u64(a, control) } @@ -67,7 +67,7 @@ pub fn _bextr2_u64(a: u64, control: u64) -> u64 { /// Clears all bits below the least significant zero bit of `x`. /// /// If there is no zero bit in `x`, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcfill))] pub unsafe fn _blcfill_u32(x: u32) -> u32 { @@ -77,7 +77,7 @@ pub unsafe fn _blcfill_u32(x: u32) -> u32 { /// Clears all bits below the least significant zero bit of `x`. /// /// If there is no zero bit in `x`, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcfill))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -88,7 +88,7 @@ pub unsafe fn _blcfill_u64(x: u64) -> u64 { /// Sets all bits of `x` to 1 except for the least significant zero bit. /// /// If there is no zero bit in `x`, it sets all bits. 
-#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blci))] pub unsafe fn _blci_u32(x: u32) -> u32 { @@ -98,7 +98,7 @@ pub unsafe fn _blci_u32(x: u32) -> u32 { /// Sets all bits of `x` to 1 except for the least significant zero bit. /// /// If there is no zero bit in `x`, it sets all bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blci))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -109,7 +109,7 @@ pub unsafe fn _blci_u64(x: u64) -> u64 { /// Sets the least significant zero bit of `x` and clears all other bits. /// /// If there is no zero bit in `x`, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcic))] pub unsafe fn _blcic_u32(x: u32) -> u32 { @@ -119,7 +119,7 @@ pub unsafe fn _blcic_u32(x: u32) -> u32 { /// Sets the least significant zero bit of `x` and clears all other bits. /// /// If there is no zero bit in `x`, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcic))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -131,7 +131,7 @@ pub unsafe fn _blcic_u64(x: u64) -> u64 { /// that bit. /// /// If there is no zero bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcmsk))] pub unsafe fn _blcmsk_u32(x: u32) -> u32 { @@ -142,7 +142,7 @@ pub unsafe fn _blcmsk_u32(x: u32) -> u32 { /// that bit. /// /// If there is no zero bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -153,7 +153,7 @@ pub unsafe fn _blcmsk_u64(x: u64) -> u64 { /// Sets the least significant zero bit of `x`. /// /// If there is no zero bit in `x`, it returns `x`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcs))] pub unsafe fn _blcs_u32(x: u32) -> u32 { @@ -163,7 +163,7 @@ pub unsafe fn _blcs_u32(x: u32) -> u32 { /// Sets the least significant zero bit of `x`. /// /// If there is no zero bit in `x`, it returns `x`. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcs))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -174,7 +174,7 @@ pub unsafe fn _blcs_u64(x: u64) -> u64 { /// Sets all bits of `x` below the least significant one. /// /// If there is no set bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsfill))] pub unsafe fn _blsfill_u32(x: u32) -> u32 { @@ -184,7 +184,7 @@ pub unsafe fn _blsfill_u32(x: u32) -> u32 { /// Sets all bits of `x` below the least significant one. /// /// If there is no set bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsfill))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -195,7 +195,7 @@ pub unsafe fn _blsfill_u64(x: u64) -> u64 { /// Clears least significant bit and sets all other bits. /// /// If there is no set bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsic))] pub unsafe fn _blsic_u32(x: u32) -> u32 { @@ -205,7 +205,7 @@ pub unsafe fn _blsic_u32(x: u32) -> u32 { /// Clears least significant bit and sets all other bits. /// /// If there is no set bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsic))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -217,7 +217,7 @@ pub unsafe fn _blsic_u64(x: u64) -> u64 { /// bits. /// /// If the least significant bit of `x` is 0, it sets all bits. 
-#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(t1mskc))] pub unsafe fn _t1mskc_u32(x: u32) -> u32 { @@ -228,7 +228,7 @@ pub unsafe fn _t1mskc_u32(x: u32) -> u32 { /// bits. /// /// If the least significant bit of `x` is 0, it sets all bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(t1mskc))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -240,7 +240,7 @@ pub unsafe fn _t1mskc_u64(x: u64) -> u64 { /// bits. /// /// If the least significant bit of `x` is 1, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(tzmsk))] pub unsafe fn _tzmsk_u32(x: u32) -> u32 { @@ -251,7 +251,7 @@ pub unsafe fn _tzmsk_u32(x: u32) -> u32 { /// bits. /// /// If the least significant bit of `x` is 1, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(tzmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions diff --git a/library/stdarch/coresimd/src/x86/i586/xsave.rs b/library/stdarch/coresimd/src/x86/i586/xsave.rs index 9a7611a82efc..ead6cd09f7e9 100644 --- a/library/stdarch/coresimd/src/x86/i586/xsave.rs +++ b/library/stdarch/coresimd/src/x86/i586/xsave.rs @@ -33,7 +33,7 @@ extern "C" { /// /// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of /// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xsave))] pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) { @@ -46,7 +46,7 @@ pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. 
-#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xrstor))] pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) { @@ -62,7 +62,7 @@ const _XCR_XFEATURE_ENABLED_MASK: u32 = 0; /// by `a`. /// /// Currently only `XFEATURE_ENABLED_MASK` `XCR` is supported. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xsetbv))] pub unsafe fn _xsetbv(a: u32, val: u64) { @@ -71,7 +71,7 @@ pub unsafe fn _xsetbv(a: u32, val: u64) { /// Reads the contents of the extended control register `XCR` /// specified in `xcr_no`. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xgetbv))] pub unsafe fn _xgetbv(xcr_no: u32) -> u64 { @@ -85,7 +85,7 @@ pub unsafe fn _xgetbv(xcr_no: u32) -> u64 { /// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize /// the manner in which data is saved. The performance of this instruction will /// be equal to or better than using the `XSAVE` instruction. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaveopt")] #[cfg_attr(test, assert_instr(xsaveopt))] pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) { @@ -98,7 +98,7 @@ pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) { /// `xsavec` differs from `xsave` in that it uses compaction and that it may /// use init optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsavec")] #[cfg_attr(test, assert_instr(xsavec))] pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) { @@ -112,7 +112,7 @@ pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) { /// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the /// modified optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. 
-#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xsaves))] pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) { @@ -128,7 +128,7 @@ pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xrstors))] pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) { diff --git a/library/stdarch/coresimd/src/x86/i686/mmx.rs b/library/stdarch/coresimd/src/x86/i686/mmx.rs index e013fb3dc62a..c5d69bcec034 100644 --- a/library/stdarch/coresimd/src/x86/i686/mmx.rs +++ b/library/stdarch/coresimd/src/x86/i686/mmx.rs @@ -16,7 +16,7 @@ use core::mem; use stdsimd_test::assert_instr; /// Constructs a 64-bit integer vector initialized to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] // FIXME: this produces a movl instead of xorps on x86 // FIXME: this produces a xor intrinsic instead of xorps on x86_64 @@ -26,7 +26,7 @@ pub unsafe fn _mm_setzero_si64() -> __m64 { } /// Add packed 8-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddb))] pub unsafe fn _mm_add_pi8(a: __m64, b: __m64) -> __m64 { @@ -34,7 +34,7 @@ pub unsafe fn _mm_add_pi8(a: __m64, b: __m64) -> __m64 { } /// Add packed 8-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddb))] pub unsafe fn _m_paddb(a: __m64, b: __m64) -> __m64 { @@ -42,7 +42,7 @@ pub unsafe fn _m_paddb(a: __m64, b: __m64) -> __m64 { } /// Add packed 16-bit integers in `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddw))] pub unsafe fn _mm_add_pi16(a: __m64, b: __m64) -> __m64 { @@ -50,7 +50,7 @@ pub unsafe fn _mm_add_pi16(a: __m64, b: __m64) -> __m64 { } /// Add packed 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddw))] pub unsafe fn _m_paddw(a: __m64, b: __m64) -> __m64 { @@ -58,7 +58,7 @@ pub unsafe fn _m_paddw(a: __m64, b: __m64) -> __m64 { } /// Add packed 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddd))] pub unsafe fn _mm_add_pi32(a: __m64, b: __m64) -> __m64 { @@ -66,7 +66,7 @@ pub unsafe fn _mm_add_pi32(a: __m64, b: __m64) -> __m64 { } /// Add packed 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddd))] pub unsafe fn _m_paddd(a: __m64, b: __m64) -> __m64 { @@ -74,7 +74,7 @@ pub unsafe fn _m_paddd(a: __m64, b: __m64) -> __m64 { } /// Add packed 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddsb))] pub unsafe fn _mm_adds_pi8(a: __m64, b: __m64) -> __m64 { @@ -82,7 +82,7 @@ pub unsafe fn _mm_adds_pi8(a: __m64, b: __m64) -> __m64 { } /// Add packed 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddsb))] pub unsafe fn _m_paddsb(a: __m64, b: __m64) -> __m64 { @@ -90,7 +90,7 @@ pub unsafe fn _m_paddsb(a: __m64, b: __m64) -> __m64 { } /// Add packed 16-bit integers in `a` and `b` using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddsw))] pub unsafe fn _mm_adds_pi16(a: __m64, b: __m64) -> __m64 { @@ -98,7 +98,7 @@ pub unsafe fn _mm_adds_pi16(a: __m64, b: __m64) -> __m64 { } /// Add packed 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddsw))] pub unsafe fn _m_paddsw(a: __m64, b: __m64) -> __m64 { @@ -106,7 +106,7 @@ pub unsafe fn _m_paddsw(a: __m64, b: __m64) -> __m64 { } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddusb))] pub unsafe fn _mm_adds_pu8(a: __m64, b: __m64) -> __m64 { @@ -114,7 +114,7 @@ pub unsafe fn _mm_adds_pu8(a: __m64, b: __m64) -> __m64 { } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddusb))] pub unsafe fn _m_paddusb(a: __m64, b: __m64) -> __m64 { @@ -122,7 +122,7 @@ pub unsafe fn _m_paddusb(a: __m64, b: __m64) -> __m64 { } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddusw))] pub unsafe fn _mm_adds_pu16(a: __m64, b: __m64) -> __m64 { @@ -130,7 +130,7 @@ pub unsafe fn _mm_adds_pu16(a: __m64, b: __m64) -> __m64 { } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddusw))] pub unsafe fn _m_paddusw(a: __m64, b: __m64) -> __m64 { @@ -138,7 +138,7 @@ pub unsafe fn _m_paddusw(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubb))] pub unsafe fn _mm_sub_pi8(a: __m64, b: __m64) -> __m64 { @@ -146,7 +146,7 @@ pub unsafe fn _mm_sub_pi8(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubb))] pub unsafe fn _m_psubb(a: __m64, b: __m64) -> __m64 { @@ -154,7 +154,7 @@ pub unsafe fn _m_psubb(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubw))] pub unsafe fn _mm_sub_pi16(a: __m64, b: __m64) -> __m64 { @@ -162,7 +162,7 @@ pub unsafe fn _mm_sub_pi16(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubw))] pub unsafe fn _m_psubw(a: __m64, b: __m64) -> __m64 { @@ -170,7 +170,7 @@ pub unsafe fn _m_psubw(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubd))] pub unsafe fn _mm_sub_pi32(a: __m64, b: __m64) -> __m64 { @@ -178,7 +178,7 @@ pub unsafe fn _mm_sub_pi32(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubd))] pub unsafe fn _m_psubd(a: __m64, b: __m64) -> __m64 { @@ -187,7 +187,7 @@ pub unsafe fn _m_psubd(a: __m64, b: __m64) -> __m64 { /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubsb))] pub unsafe fn _mm_subs_pi8(a: __m64, b: __m64) -> __m64 { @@ -196,7 +196,7 @@ pub unsafe fn _mm_subs_pi8(a: __m64, b: __m64) -> __m64 { /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubsb))] pub unsafe fn _m_psubsb(a: __m64, b: __m64) -> __m64 { @@ -205,7 +205,7 @@ pub unsafe fn _m_psubsb(a: __m64, b: __m64) -> __m64 { /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubsw))] pub unsafe fn _mm_subs_pi16(a: __m64, b: __m64) -> __m64 { @@ -214,7 +214,7 @@ pub unsafe fn _mm_subs_pi16(a: __m64, b: __m64) -> __m64 { /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubsw))] pub unsafe fn _m_psubsw(a: __m64, b: __m64) -> __m64 { @@ -223,7 +223,7 @@ pub unsafe fn _m_psubsw(a: __m64, b: __m64) -> __m64 { /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit /// integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubusb))] pub unsafe fn _mm_subs_pu8(a: __m64, b: __m64) -> __m64 { @@ -232,7 +232,7 @@ pub unsafe fn _mm_subs_pu8(a: __m64, b: __m64) -> __m64 { /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit /// integers in `a` using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubusb))] pub unsafe fn _m_psubusb(a: __m64, b: __m64) -> __m64 { @@ -241,7 +241,7 @@ pub unsafe fn _m_psubusb(a: __m64, b: __m64) -> __m64 { /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned /// 16-bit integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubusw))] pub unsafe fn _mm_subs_pu16(a: __m64, b: __m64) -> __m64 { @@ -250,7 +250,7 @@ pub unsafe fn _mm_subs_pu16(a: __m64, b: __m64) -> __m64 { /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned /// 16-bit integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubusw))] pub unsafe fn _m_psubusw(a: __m64, b: __m64) -> __m64 { @@ -262,7 +262,7 @@ pub unsafe fn _m_psubusw(a: __m64, b: __m64) -> __m64 { /// /// Positive values greater than 0x7F are saturated to 0x7F. Negative values /// less than 0x80 are saturated to 0x80. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(packsswb))] pub unsafe fn _mm_packs_pi16(a: __m64, b: __m64) -> __m64 { @@ -274,7 +274,7 @@ pub unsafe fn _mm_packs_pi16(a: __m64, b: __m64) -> __m64 { /// /// Positive values greater than 0x7F are saturated to 0x7F. Negative values /// less than 0x80 are saturated to 0x80. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(packssdw))] pub unsafe fn _mm_packs_pi32(a: __m64, b: __m64) -> __m64 { @@ -283,7 +283,7 @@ pub unsafe fn _mm_packs_pi32(a: __m64, b: __m64) -> __m64 { /// Compares whether each element of `a` is greater than the corresponding /// element of `b` returning `0` for `false` and `-1` for `true`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(pcmpgtb))] pub unsafe fn _mm_cmpgt_pi8(a: __m64, b: __m64) -> __m64 { @@ -292,7 +292,7 @@ pub unsafe fn _mm_cmpgt_pi8(a: __m64, b: __m64) -> __m64 { /// Compares whether each element of `a` is greater than the corresponding /// element of `b` returning `0` for `false` and `-1` for `true`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(pcmpgtw))] pub unsafe fn _mm_cmpgt_pi16(a: __m64, b: __m64) -> __m64 { @@ -301,7 +301,7 @@ pub unsafe fn _mm_cmpgt_pi16(a: __m64, b: __m64) -> __m64 { /// Compares whether each element of `a` is greater than the corresponding /// element of `b` returning `0` for `false` and `-1` for `true`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(pcmpgtd))] pub unsafe fn _mm_cmpgt_pi32(a: __m64, b: __m64) -> __m64 { @@ -310,7 +310,7 @@ pub unsafe fn _mm_cmpgt_pi32(a: __m64, b: __m64) -> __m64 { /// Unpacks the upper two elements from two `i16x4` vectors and interleaves /// them into the result: `[a.2, b.2, a.3, b.3]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpckhwd))] // FIXME punpcklbw expected pub unsafe fn _mm_unpackhi_pi16(a: __m64, b: __m64) -> __m64 { @@ -319,7 +319,7 @@ pub unsafe fn _mm_unpackhi_pi16(a: __m64, b: __m64) -> __m64 { /// Unpacks the upper four elements from two `i8x8` vectors and interleaves /// them into the result: `[a.4, b.4, a.5, b.5, a.6, b.6, a.7, b.7]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpckhbw))] pub unsafe fn _mm_unpackhi_pi8(a: __m64, b: __m64) -> __m64 { @@ -328,7 +328,7 @@ pub unsafe fn _mm_unpackhi_pi8(a: __m64, b: __m64) -> __m64 { /// Unpacks the lower four elements from two `i8x8` vectors and interleaves /// them into the result: `[a.0, b.0, a.1, b.1, a.2, b.2, a.3, b.3]`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpcklbw))] pub unsafe fn _mm_unpacklo_pi8(a: __m64, b: __m64) -> __m64 { @@ -337,7 +337,7 @@ pub unsafe fn _mm_unpacklo_pi8(a: __m64, b: __m64) -> __m64 { /// Unpacks the lower two elements from two `i16x4` vectors and interleaves /// them into the result: `[a.0 b.0 a.1 b.1]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpcklwd))] pub unsafe fn _mm_unpacklo_pi16(a: __m64, b: __m64) -> __m64 { @@ -346,7 +346,7 @@ pub unsafe fn _mm_unpacklo_pi16(a: __m64, b: __m64) -> __m64 { /// Unpacks the upper element from two `i32x2` vectors and interleaves them /// into the result: `[a.1, b.1]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpckhdq))] pub unsafe fn _mm_unpackhi_pi32(a: __m64, b: __m64) -> __m64 { @@ -355,7 +355,7 @@ pub unsafe fn _mm_unpackhi_pi32(a: __m64, b: __m64) -> __m64 { /// Unpacks the lower element from two `i32x2` vectors and interleaves them /// into the result: `[a.0, b.0]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpckldq))] pub unsafe fn _mm_unpacklo_pi32(a: __m64, b: __m64) -> __m64 { @@ -363,21 +363,21 @@ pub unsafe fn _mm_unpacklo_pi32(a: __m64, b: __m64) -> __m64 { } /// Set packed 16-bit integers in dst with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set_pi16(e3: i16, e2: i16, e1: i16, e0: i16) -> __m64 { _mm_setr_pi16(e0, e1, e2, e3) } /// Set packed 32-bit integers in dst with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set_pi32(e1: i32, e0: i32) -> __m64 { _mm_setr_pi32(e0, e1) } /// Set packed 8-bit integers in dst with the supplied values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set_pi8( e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8 @@ -386,21 +386,21 @@ pub unsafe fn _mm_set_pi8( } /// Broadcast 16-bit integer a to all all elements of dst. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set1_pi16(a: i16) -> __m64 { _mm_setr_pi16(a, a, a, a) } /// Broadcast 32-bit integer a to all all elements of dst. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set1_pi32(a: i32) -> __m64 { _mm_setr_pi32(a, a) } /// Broadcast 8-bit integer a to all all elements of dst. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set1_pi8(a: i8) -> __m64 { _mm_setr_pi8(a, a, a, a, a, a, a, a) @@ -408,7 +408,7 @@ pub unsafe fn _mm_set1_pi8(a: i8) -> __m64 { /// Set packed 16-bit integers in dst with the supplied values in reverse /// order. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_setr_pi16(e0: i16, e1: i16, e2: i16, e3: i16) -> __m64 { mem::transmute(i16x4::new(e0, e1, e2, e3)) @@ -416,14 +416,14 @@ pub unsafe fn _mm_setr_pi16(e0: i16, e1: i16, e2: i16, e3: i16) -> __m64 { /// Set packed 32-bit integers in dst with the supplied values in reverse /// order. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_setr_pi32(e0: i32, e1: i32) -> __m64 { mem::transmute(i32x2::new(e0, e1)) } /// Set packed 8-bit integers in dst with the supplied values in reverse order. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_setr_pi8( e0: i8, e1: i8, e2: i8, e3: i8, e4: i8, e5: i8, e6: i8, e7: i8 diff --git a/library/stdarch/coresimd/src/x86/i686/sse2.rs b/library/stdarch/coresimd/src/x86/i686/sse2.rs index f82c4372a963..ba34838c0326 100644 --- a/library/stdarch/coresimd/src/x86/i686/sse2.rs +++ b/library/stdarch/coresimd/src/x86/i686/sse2.rs @@ -10,7 +10,7 @@ use stdsimd_test::assert_instr; /// Adds two signed or unsigned 64-bit integer values, returning the /// lower 64 bits of the sum. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(paddq))] pub unsafe fn _mm_add_si64(a: __m64, b: __m64) -> __m64 { @@ -20,7 +20,7 @@ pub unsafe fn _mm_add_si64(a: __m64, b: __m64) -> __m64 { /// Multiplies 32-bit unsigned integer values contained in the lower bits /// of the two 64-bit integer vectors and returns the 64-bit unsigned /// product. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(pmuludq))] pub unsafe fn _mm_mul_su32(a: __m64, b: __m64) -> __m64 { @@ -29,7 +29,7 @@ pub unsafe fn _mm_mul_su32(a: __m64, b: __m64) -> __m64 { /// Subtracts signed or unsigned 64-bit integer values and writes the /// difference to the corresponding bits in the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(psubq))] pub unsafe fn _mm_sub_si64(a: __m64, b: __m64) -> __m64 { @@ -39,7 +39,7 @@ pub unsafe fn _mm_sub_si64(a: __m64, b: __m64) -> __m64 { /// Converts the two signed 32-bit integer elements of a 64-bit vector of /// [2 x i32] into two double-precision floating-point values, returned in a /// 128-bit vector of [2 x double]. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(cvtpi2pd))] pub unsafe fn _mm_cvtpi32_pd(a: __m64) -> __m128d { @@ -48,7 +48,7 @@ pub unsafe fn _mm_cvtpi32_pd(a: __m64) -> __m128d { /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with /// the specified 64-bit integer values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // no particular instruction to test pub unsafe fn _mm_set_epi64(e1: __m64, e0: __m64) -> __m128i { @@ -57,7 +57,7 @@ pub unsafe fn _mm_set_epi64(e1: __m64, e0: __m64) -> __m128i { /// Initializes both values in a 128-bit vector of [2 x i64] with the /// specified 64-bit value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // no particular instruction to test pub unsafe fn _mm_set1_epi64(a: __m64) -> __m128i { @@ -66,7 +66,7 @@ pub unsafe fn _mm_set1_epi64(a: __m64) -> __m128i { /// Constructs a 128-bit integer vector, initialized in reverse order /// with the specified 64-bit integral values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // no particular instruction to test pub unsafe fn _mm_setr_epi64(e1: __m64, e0: __m64) -> __m128i { @@ -75,7 +75,7 @@ pub unsafe fn _mm_setr_epi64(e1: __m64, e0: __m64) -> __m128i { /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit /// integer. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // #[cfg_attr(test, assert_instr(movdq2q))] // FIXME: llvm codegens wrong // instr? @@ -85,7 +85,7 @@ pub unsafe fn _mm_movepi64_pi64(a: __m128i) -> __m64 { /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the /// upper bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // #[cfg_attr(test, assert_instr(movq2dq))] // FIXME: llvm codegens wrong // instr? 
@@ -96,7 +96,7 @@ pub unsafe fn _mm_movpi64_epi64(a: __m64) -> __m128i { /// Converts the two double-precision floating-point elements of a /// 128-bit vector of [2 x double] into two signed 32-bit integer values, /// returned in a 64-bit vector of [2 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(cvtpd2pi))] pub unsafe fn _mm_cvtpd_pi32(a: __m128d) -> __m64 { @@ -108,7 +108,7 @@ pub unsafe fn _mm_cvtpd_pi32(a: __m128d) -> __m64 { /// returned in a 64-bit vector of [2 x i32]. /// If the result of either conversion is inexact, the result is truncated /// (rounded towards zero) regardless of the current MXCSR setting. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(cvttpd2pi))] pub unsafe fn _mm_cvttpd_pi32(a: __m128d) -> __m64 { diff --git a/library/stdarch/coresimd/src/x86/i686/sse41.rs b/library/stdarch/coresimd/src/x86/i686/sse41.rs index a8dd65cfe021..3f35305b1874 100644 --- a/library/stdarch/coresimd/src/x86/i686/sse41.rs +++ b/library/stdarch/coresimd/src/x86/i686/sse41.rs @@ -29,7 +29,7 @@ extern "C" { /// /// * `1` - if the specified bits are all zeros, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { @@ -49,7 +49,7 @@ pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the specified bits are all ones, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { @@ -69,7 +69,7 @@ pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the specified bits are neither all zeros nor all ones, /// * `0` - otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 { @@ -89,7 +89,7 @@ pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the specified bits are all zeros, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 { @@ -107,7 +107,7 @@ pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the bits specified in the operand are all set to 1, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pcmpeqd))] #[cfg_attr(test, assert_instr(ptest))] @@ -128,7 +128,7 @@ pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 { /// /// * `1` - if the specified bits are neither all zeros nor all ones, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { diff --git a/library/stdarch/coresimd/src/x86/i686/sse42.rs b/library/stdarch/coresimd/src/x86/i686/sse42.rs index 301dd3ea77bc..f092fe412f9c 100644 --- a/library/stdarch/coresimd/src/x86/i686/sse42.rs +++ b/library/stdarch/coresimd/src/x86/i686/sse42.rs @@ -9,7 +9,7 @@ use stdsimd_test::assert_instr; /// Compare packed 64-bit integers in `a` and `b` for greater-than, /// return the results. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpgtq))] pub unsafe fn _mm_cmpgt_epi64(a: __m128i, b: __m128i) -> __m128i { diff --git a/library/stdarch/coresimd/src/x86/i686/sse4a.rs b/library/stdarch/coresimd/src/x86/i686/sse4a.rs index f35ffb3e501a..5e226322f8c5 100644 --- a/library/stdarch/coresimd/src/x86/i686/sse4a.rs +++ b/library/stdarch/coresimd/src/x86/i686/sse4a.rs @@ -33,7 +33,7 @@ extern "C" { /// /// If `length == 0 && index > 0` or `lenght + index > 64` the result is /// undefined. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(extrq))] pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { @@ -49,7 +49,7 @@ pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { /// /// If the `length` is zero it is interpreted as `64`. If `index + length > 64` /// or `index > 0 && length == 0` the result is undefined. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(insertq))] pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { @@ -57,7 +57,7 @@ pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { } /// Non-temporal store of `a.0` into `p`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(movntsd))] pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { @@ -65,7 +65,7 @@ pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { } /// Non-temporal store of `a.0` into `p`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(movntss))] pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) { diff --git a/library/stdarch/coresimd/src/x86/i686/ssse3.rs b/library/stdarch/coresimd/src/x86/i686/ssse3.rs index 647074096cdf..c386d8a0a4ad 100644 --- a/library/stdarch/coresimd/src/x86/i686/ssse3.rs +++ b/library/stdarch/coresimd/src/x86/i686/ssse3.rs @@ -7,7 +7,7 @@ use x86::*; /// Compute the absolute value of packed 8-bit integers in `a` and /// return the unsigned results. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pabsb))] pub unsafe fn _mm_abs_pi8(a: __m64) -> __m64 { @@ -16,7 +16,7 @@ pub unsafe fn _mm_abs_pi8(a: __m64) -> __m64 { /// Compute the absolute value of packed 8-bit integers in `a`, and return the /// unsigned results. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pabsw))] pub unsafe fn _mm_abs_pi16(a: __m64) -> __m64 { @@ -25,7 +25,7 @@ pub unsafe fn _mm_abs_pi16(a: __m64) -> __m64 { /// Compute the absolute value of packed 32-bit integers in `a`, and return the /// unsigned results. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pabsd))] pub unsafe fn _mm_abs_pi32(a: __m64) -> __m64 { @@ -34,7 +34,7 @@ pub unsafe fn _mm_abs_pi32(a: __m64) -> __m64 { /// Shuffle packed 8-bit integers in `a` according to shuffle control mask in /// the corresponding 8-bit element of `b`, and return the results -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pshufb))] pub unsafe fn _mm_shuffle_pi8(a: __m64, b: __m64) -> __m64 { @@ -43,7 +43,7 @@ pub unsafe fn _mm_shuffle_pi8(a: __m64, b: __m64) -> __m64 { /// Concatenates the two 64-bit integer vector operands, and right-shifts /// the result by the number of bytes specified in the immediate operand. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(palignr, n = 15))] pub unsafe fn _mm_alignr_pi8(a: __m64, b: __m64, n: i32) -> __m64 { @@ -57,7 +57,7 @@ pub unsafe fn _mm_alignr_pi8(a: __m64, b: __m64, n: i32) -> __m64 { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 64-bit vectors of [4 x i16]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phaddw))] pub unsafe fn _mm_hadd_pi16(a: __m64, b: __m64) -> __m64 { @@ -66,7 +66,7 @@ pub unsafe fn _mm_hadd_pi16(a: __m64, b: __m64) -> __m64 { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 64-bit vectors of [2 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phaddd))] pub unsafe fn _mm_hadd_pi32(a: __m64, b: __m64) -> __m64 { @@ -76,7 +76,7 @@ pub unsafe fn _mm_hadd_pi32(a: __m64, b: __m64) -> __m64 { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are /// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phaddsw))] pub unsafe fn _mm_hadds_pi16(a: __m64, b: __m64) -> __m64 { @@ -85,7 +85,7 @@ pub unsafe fn _mm_hadds_pi16(a: __m64, b: __m64) -> __m64 { /// Horizontally subtracts the adjacent pairs of values contained in 2 /// packed 64-bit vectors of [4 x i16]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phsubw))] pub unsafe fn _mm_hsub_pi16(a: __m64, b: __m64) -> __m64 { @@ -94,7 +94,7 @@ pub unsafe fn _mm_hsub_pi16(a: __m64, b: __m64) -> __m64 { /// Horizontally subtracts the adjacent pairs of values contained in 2 /// packed 64-bit vectors of [2 x i32]. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phsubd))] pub unsafe fn _mm_hsub_pi32(a: __m64, b: __m64) -> __m64 { @@ -105,7 +105,7 @@ pub unsafe fn _mm_hsub_pi32(a: __m64, b: __m64) -> __m64 { /// packed 64-bit vectors of [4 x i16]. Positive differences greater than /// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are /// saturated to 8000h. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phsubsw))] pub unsafe fn _mm_hsubs_pi16(a: __m64, b: __m64) -> __m64 { @@ -117,7 +117,7 @@ pub unsafe fn _mm_hsubs_pi16(a: __m64, b: __m64) -> __m64 { /// integer values contained in the second source operand, adds pairs of /// contiguous products with signed saturation, and writes the 16-bit sums to /// the corresponding bits in the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pmaddubsw))] pub unsafe fn _mm_maddubs_pi16(a: __m64, b: __m64) -> __m64 { @@ -127,7 +127,7 @@ pub unsafe fn _mm_maddubs_pi16(a: __m64, b: __m64) -> __m64 { /// Multiplies packed 16-bit signed integer values, truncates the 32-bit /// products to the 18 most significant bits by right-shifting, rounds the /// truncated value by adding 1, and writes bits [16:1] to the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pmulhrsw))] pub unsafe fn _mm_mulhrs_pi16(a: __m64, b: __m64) -> __m64 { @@ -138,7 +138,7 @@ pub unsafe fn _mm_mulhrs_pi16(a: __m64, b: __m64) -> __m64 { /// integer in `b` is negative, and return the results. /// Element in result are zeroed out when the corresponding element in `b` is /// zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(psignb))] pub unsafe fn _mm_sign_pi8(a: __m64, b: __m64) -> __m64 { @@ -149,7 +149,7 @@ pub unsafe fn _mm_sign_pi8(a: __m64, b: __m64) -> __m64 { /// integer in `b` is negative, and return the results. /// Element in result are zeroed out when the corresponding element in `b` is /// zero. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(psignw))] pub unsafe fn _mm_sign_pi16(a: __m64, b: __m64) -> __m64 { @@ -160,7 +160,7 @@ pub unsafe fn _mm_sign_pi16(a: __m64, b: __m64) -> __m64 { /// integer in `b` is negative, and return the results. /// Element in result are zeroed out when the corresponding element in `b` is /// zero. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(psignd))] pub unsafe fn _mm_sign_pi32(a: __m64, b: __m64) -> __m64 { diff --git a/library/stdarch/coresimd/src/x86/mod.rs b/library/stdarch/coresimd/src/x86/mod.rs index 05e99a9b9c62..d1f42f6cc919 100644 --- a/library/stdarch/coresimd/src/x86/mod.rs +++ b/library/stdarch/coresimd/src/x86/mod.rs @@ -17,7 +17,7 @@ macro_rules! 
types { pub struct $name($($fields)*); impl Clone for $name { - #[inline(always)] // currently needed for correctness + #[inline] // currently needed for correctness fn clone(&self) -> $name { *self } @@ -307,49 +307,49 @@ pub use self::test::*; trait m128iExt: Sized { fn as_m128i(self) -> __m128i; - #[inline(always)] + #[inline] fn as_u8x16(self) -> ::v128::u8x16 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_u16x8(self) -> ::v128::u16x8 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_u32x4(self) -> ::v128::u32x4 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_u64x2(self) -> ::v128::u64x2 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_i8x16(self) -> ::v128::i8x16 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_i16x8(self) -> ::v128::i16x8 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_i32x4(self) -> ::v128::i32x4 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_i64x2(self) -> ::v128::i64x2 { unsafe { mem::transmute(self.as_m128i()) } } } impl m128iExt for __m128i { - #[inline(always)] + #[inline] fn as_m128i(self) -> __m128i { self } } @@ -358,49 +358,49 @@ impl m128iExt for __m128i { trait m256iExt: Sized { fn as_m256i(self) -> __m256i; - #[inline(always)] + #[inline] fn as_u8x32(self) -> ::v256::u8x32 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_u16x16(self) -> ::v256::u16x16 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_u32x8(self) -> ::v256::u32x8 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_u64x4(self) -> ::v256::u64x4 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_i8x32(self) -> ::v256::i8x32 { unsafe { mem::transmute(self.as_m256i()) } } - 
#[inline(always)] + #[inline] fn as_i16x16(self) -> ::v256::i16x16 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_i32x8(self) -> ::v256::i32x8 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_i64x4(self) -> ::v256::i64x4 { unsafe { mem::transmute(self.as_m256i()) } } } impl m256iExt for __m256i { - #[inline(always)] + #[inline] fn as_m256i(self) -> __m256i { self } } diff --git a/library/stdarch/coresimd/src/x86/x86_64/abm.rs b/library/stdarch/coresimd/src/x86/x86_64/abm.rs index 988950104854..235fa8bb1719 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/abm.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/abm.rs @@ -4,7 +4,7 @@ use stdsimd_test::assert_instr; /// Counts the leading most significant zero bits. /// /// When the operand is zero, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "lzcnt")] #[cfg_attr(test, assert_instr(lzcnt))] pub unsafe fn _lzcnt_u64(x: u64) -> u64 { @@ -12,7 +12,7 @@ pub unsafe fn _lzcnt_u64(x: u64) -> u64 { } /// Counts the bits that are set. -#[inline(always)] +#[inline] #[target_feature(enable = "popcnt")] #[cfg_attr(test, assert_instr(popcnt))] pub unsafe fn _popcnt64(x: i64) -> i32 { diff --git a/library/stdarch/coresimd/src/x86/x86_64/avx.rs b/library/stdarch/coresimd/src/x86/x86_64/avx.rs index ae92ccd71158..3f9fda1451f2 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/avx.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/avx.rs @@ -5,7 +5,7 @@ use x86::*; /// Copy `a` to result, and insert the 64-bit integer `i` into result /// at the location specified by `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. 
pub unsafe fn _mm256_insert_epi64(a: __m256i, i: i64, index: i32) -> __m256i { diff --git a/library/stdarch/coresimd/src/x86/x86_64/avx2.rs b/library/stdarch/coresimd/src/x86/x86_64/avx2.rs index 840569e7c960..6cdb542bfb95 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/avx2.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/avx2.rs @@ -2,7 +2,7 @@ use simd_llvm::*; use x86::*; /// Extract a 64-bit integer from `a`, selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_extract_epi64(a: __m256i, imm8: i32) -> i64 { diff --git a/library/stdarch/coresimd/src/x86/x86_64/bmi.rs b/library/stdarch/coresimd/src/x86/x86_64/bmi.rs index cda1daa7e710..d8d8a749723b 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/bmi.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/bmi.rs @@ -3,7 +3,7 @@ use stdsimd_test::assert_instr; /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(bextr))] #[cfg(not(target_arch = "x86"))] @@ -16,7 +16,7 @@ pub unsafe fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 { /// /// Bits [7,0] of `control` specify the index to the first bit in the range to /// be extracted, and bits [15,8] specify the length of the range. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(bextr))] #[cfg(not(target_arch = "x86"))] @@ -25,7 +25,7 @@ pub unsafe fn _bextr2_u64(a: u64, control: u64) -> u64 { } /// Bitwise logical `AND` of inverted `a` with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(andn))] pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 { @@ -33,7 +33,7 @@ pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 { } /// Extract lowest set isolated bit. 
-#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsi))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -42,7 +42,7 @@ pub unsafe fn _blsi_u64(x: u64) -> u64 { } /// Get mask up to lowest set bit. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -53,7 +53,7 @@ pub unsafe fn _blsmsk_u64(x: u64) -> u64 { /// Resets the lowest set bit of `x`. /// /// If `x` is sets CF. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsr))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -64,7 +64,7 @@ pub unsafe fn _blsr_u64(x: u64) -> u64 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(tzcnt))] pub unsafe fn _tzcnt_u64(x: u64) -> u64 { @@ -74,7 +74,7 @@ pub unsafe fn _tzcnt_u64(x: u64) -> u64 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(tzcnt))] pub unsafe fn _mm_tzcnt_64(x: u64) -> i64 { diff --git a/library/stdarch/coresimd/src/x86/x86_64/bmi2.rs b/library/stdarch/coresimd/src/x86/x86_64/bmi2.rs index 761fa5fec107..b1c74d15c888 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/bmi2.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/bmi2.rs @@ -5,7 +5,7 @@ use stdsimd_test::assert_instr; /// /// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with /// the low half and the high half of the result. 
-#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(mulx))] #[target_feature(enable = "bmi2")] #[cfg(not(target_arch = "x86"))] // calls an intrinsic @@ -16,7 +16,7 @@ pub unsafe fn _mulx_u64(a: u64, b: u64, hi: &mut u64) -> u64 { } /// Zero higher bits of `a` >= `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(bzhi))] #[cfg(not(target_arch = "x86"))] @@ -26,7 +26,7 @@ pub unsafe fn _bzhi_u64(a: u64, index: u32) -> u64 { /// Scatter contiguous low order bits of `a` to the result at the positions /// specified by the `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pdep))] #[cfg(not(target_arch = "x86"))] @@ -36,7 +36,7 @@ pub unsafe fn _pdep_u64(a: u64, mask: u64) -> u64 { /// Gathers the bits of `x` specified by the `mask` into the contiguous low /// order bit positions of the result. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pext))] #[cfg(not(target_arch = "x86"))] diff --git a/library/stdarch/coresimd/src/x86/x86_64/fxsr.rs b/library/stdarch/coresimd/src/x86/x86_64/fxsr.rs index c2a7391a2b0e..d717db15dc8d 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/fxsr.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/fxsr.rs @@ -21,7 +21,7 @@ extern "C" { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html -#[inline(always)] +#[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxsave64))] pub unsafe fn _fxsave64(mem_addr: *mut u8) { @@ -42,7 +42,7 @@ pub unsafe fn _fxsave64(mem_addr: *mut u8) { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html -#[inline(always)] +#[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxrstor64))] pub unsafe fn _fxrstor64(mem_addr: *const u8) { diff --git 
a/library/stdarch/coresimd/src/x86/x86_64/sse.rs b/library/stdarch/coresimd/src/x86/x86_64/sse.rs index ff7929afc998..4763e81b6f8a 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/sse.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/sse.rs @@ -24,7 +24,7 @@ extern "C" { /// [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTSS2SI` instruction (with 64 bit output). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtss2si))] pub unsafe fn _mm_cvtss_si64(a: __m128) -> i64 { @@ -40,7 +40,7 @@ pub unsafe fn _mm_cvtss_si64(a: __m128) -> i64 { /// point exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTTSS2SI` instruction (with 64 bit output). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvttss2si))] pub unsafe fn _mm_cvttss_si64(a: __m128) -> i64 { @@ -52,7 +52,7 @@ pub unsafe fn _mm_cvttss_si64(a: __m128) -> i64 { /// /// This intrinsic corresponds to the `CVTSI2SS` instruction (with 64 bit /// input). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtsi2ss))] pub unsafe fn _mm_cvtsi64_ss(a: __m128, b: i64) -> __m128 { diff --git a/library/stdarch/coresimd/src/x86/x86_64/sse2.rs b/library/stdarch/coresimd/src/x86/x86_64/sse2.rs index d99a39115740..ff16d1f957af 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/sse2.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/sse2.rs @@ -16,7 +16,7 @@ extern "C" { /// Convert the lower double-precision (64-bit) floating-point element in a to /// a 64-bit integer. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2si))] pub unsafe fn _mm_cvtsd_si64(a: __m128d) -> i64 { @@ -24,7 +24,7 @@ pub unsafe fn _mm_cvtsd_si64(a: __m128d) -> i64 { } /// Alias for [`_mm_cvtsd_si64`](fn._mm_cvtsd_si64_ss.html). 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2si))] pub unsafe fn _mm_cvtsd_si64x(a: __m128d) -> i64 { @@ -33,7 +33,7 @@ pub unsafe fn _mm_cvtsd_si64x(a: __m128d) -> i64 { /// Convert the lower double-precision (64-bit) floating-point element in `a` /// to a 64-bit integer with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttsd2si))] pub unsafe fn _mm_cvttsd_si64(a: __m128d) -> i64 { @@ -41,7 +41,7 @@ pub unsafe fn _mm_cvttsd_si64(a: __m128d) -> i64 { } /// Alias for [`_mm_cvttsd_si64`](fn._mm_cvttsd_si64_ss.html). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttsd2si))] pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 { @@ -51,7 +51,7 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 { /// Stores a 64-bit integer value in the specified memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movnti))] pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) { @@ -60,7 +60,7 @@ pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) { /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] pub unsafe fn _mm_cvtsi64_si128(a: i64) -> __m128i { @@ -69,7 +69,7 @@ pub unsafe fn _mm_cvtsi64_si128(a: i64) -> __m128i { /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> __m128i { @@ -77,7 +77,7 @@ pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> __m128i { } /// Return the lowest element of `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] pub unsafe fn _mm_cvtsi128_si64(a: __m128i) -> i64 { @@ -85,7 +85,7 @@ pub unsafe fn _mm_cvtsi128_si64(a: __m128i) -> i64 { } /// Return the lowest element of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] pub unsafe fn _mm_cvtsi128_si64x(a: __m128i) -> i64 { @@ -94,7 +94,7 @@ pub unsafe fn _mm_cvtsi128_si64x(a: __m128i) -> i64 { /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsi2sd))] pub unsafe fn _mm_cvtsi64_sd(a: __m128d, b: i64) -> __m128d { @@ -103,7 +103,7 @@ pub unsafe fn _mm_cvtsi64_sd(a: __m128d, b: i64) -> __m128d { /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsi2sd))] pub unsafe fn _mm_cvtsi64x_sd(a: __m128d, b: i64) -> __m128d { diff --git a/library/stdarch/coresimd/src/x86/x86_64/sse41.rs b/library/stdarch/coresimd/src/x86/x86_64/sse41.rs index a7f25a4ae324..2747ad44718d 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/sse41.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/sse41.rs @@ -9,7 +9,7 @@ use simd_llvm::*; use stdsimd_test::assert_instr; /// Extract an 64-bit integer from `a` selected with `imm8` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] // TODO: Add test for Windows #[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8 = 1))] @@ -20,7 +20,7 @@ pub unsafe fn _mm_extract_epi64(a: __m128i, imm8: i32) -> i64 { /// Return a copy of `a` with the 64-bit integer from `i` inserted at a /// location specified by `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pinsrq, imm8 = 0))] pub unsafe fn _mm_insert_epi64(a: __m128i, i: i64, imm8: i32) -> __m128i { diff --git a/library/stdarch/coresimd/src/x86/x86_64/sse42.rs b/library/stdarch/coresimd/src/x86/x86_64/sse42.rs index 12fd87ea2b1a..6fe79ea8c01c 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/sse42.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/sse42.rs @@ -11,7 +11,7 @@ extern "C" { /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 64-bit integer `v`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] pub unsafe fn _mm_crc32_u64(crc: u64, v: u64) -> u64 { diff --git a/library/stdarch/coresimd/src/x86/x86_64/xsave.rs b/library/stdarch/coresimd/src/x86/x86_64/xsave.rs index fc8b38ced6d4..0ddd8b1476b4 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/xsave.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/xsave.rs @@ -29,7 +29,7 @@ extern "C" { /// /// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of /// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xsave64))] pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) { @@ -42,7 +42,7 @@ pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xrstor64))] pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) { @@ -56,7 +56,7 @@ pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) { /// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize /// the manner in which data is saved. 
The performance of this instruction will /// be equal to or better than using the `XSAVE64` instruction. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaveopt")] #[cfg_attr(test, assert_instr(xsaveopt64))] pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) { @@ -69,7 +69,7 @@ pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) { /// `xsavec` differs from `xsave` in that it uses compaction and that it may /// use init optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsavec")] #[cfg_attr(test, assert_instr(xsavec64))] pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) { @@ -83,7 +83,7 @@ pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) { /// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the /// modified optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xsaves64))] pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) { @@ -99,7 +99,7 @@ pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xrstors64))] pub unsafe fn _xrstors64(mem_addr: *const u8, rs_mask: u64) {