diff --git a/library/stdarch/coresimd/src/aarch64/neon.rs b/library/stdarch/coresimd/src/aarch64/neon.rs index 353a5987c725..047fe2c4a7c4 100644 --- a/library/stdarch/coresimd/src/aarch64/neon.rs +++ b/library/stdarch/coresimd/src/aarch64/neon.rs @@ -8,7 +8,7 @@ use simd_llvm::simd_add; use v128::f64x2; /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] pub unsafe fn vadd_f64(a: f64, b: f64) -> f64 { @@ -16,7 +16,7 @@ pub unsafe fn vadd_f64(a: f64, b: f64) -> f64 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] pub unsafe fn vaddq_f64(a: f64x2, b: f64x2) -> f64x2 { @@ -24,7 +24,7 @@ pub unsafe fn vaddq_f64(a: f64x2, b: f64x2) -> f64x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 { @@ -32,7 +32,7 @@ pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddd_u64(a: u64, b: u64) -> u64 { diff --git a/library/stdarch/coresimd/src/aarch64/v8.rs b/library/stdarch/coresimd/src/aarch64/v8.rs index 55a352bcd8a3..e9c00338bd36 100644 --- a/library/stdarch/coresimd/src/aarch64/v8.rs +++ b/library/stdarch/coresimd/src/aarch64/v8.rs @@ -9,14 +9,14 @@ use stdsimd_test::assert_instr; /// Reverse the order of the bytes. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rev))] pub unsafe fn _rev_u64(x: u64) -> u64 { x.swap_bytes() as u64 } /// Count Leading Zeros. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(clz))] pub unsafe fn _clz_u64(x: u64) -> u64 { x.leading_zeros() as u64 @@ -29,7 +29,7 @@ extern "C" { } /// Reverse the bit order. 
-#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rbit))] pub unsafe fn _rbit_u64(x: u64) -> u64 { rbit_u64(x as i64) as u64 @@ -39,7 +39,7 @@ pub unsafe fn _rbit_u64(x: u64) -> u64 { /// /// When all bits of the operand are set it returns the size of the operand in /// bits. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(cls))] pub unsafe fn _cls_u32(x: u32) -> u32 { u32::leading_zeros((((((x as i32) >> 31) as u32) ^ x) << 1) | 1) as u32 @@ -49,7 +49,7 @@ pub unsafe fn _cls_u32(x: u32) -> u32 { /// /// When all bits of the operand are set it returns the size of the operand in /// bits. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(cls))] pub unsafe fn _cls_u64(x: u64) -> u64 { u64::leading_zeros((((((x as i64) >> 63) as u64) ^ x) << 1) | 1) as u64 diff --git a/library/stdarch/coresimd/src/arm/neon.rs b/library/stdarch/coresimd/src/arm/neon.rs index 0c4efae29f4a..858594ccd497 100644 --- a/library/stdarch/coresimd/src/arm/neon.rs +++ b/library/stdarch/coresimd/src/arm/neon.rs @@ -9,7 +9,7 @@ use v64::{f32x2, i16x4, i32x2, i8x8, u16x4, u32x2, u8x8}; use v128::{f32x4, i16x8, i32x4, i64x2, i8x16, u16x8, u32x4, u64x2, u8x16}; /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_s8(a: i8x8, b: i8x8) -> i8x8 { @@ -17,7 +17,7 @@ pub unsafe fn vadd_s8(a: i8x8, b: i8x8) -> i8x8 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_s8(a: i8x16, b: i8x16) -> i8x16 { @@ -25,7 +25,7 @@ pub unsafe fn vaddq_s8(a: i8x16, b: i8x16) -> i8x16 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_s16(a: i16x4, b: i16x4) -> i16x4 { @@ -33,7 +33,7 @@ pub unsafe fn vadd_s16(a: i16x4, b: i16x4) -> i16x4 { } /// Vector add. 
-#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_s16(a: i16x8, b: i16x8) -> i16x8 { @@ -41,7 +41,7 @@ pub unsafe fn vaddq_s16(a: i16x8, b: i16x8) -> i16x8 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_s32(a: i32x2, b: i32x2) -> i32x2 { @@ -49,7 +49,7 @@ pub unsafe fn vadd_s32(a: i32x2, b: i32x2) -> i32x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_s32(a: i32x4, b: i32x4) -> i32x4 { @@ -57,7 +57,7 @@ pub unsafe fn vaddq_s32(a: i32x4, b: i32x4) -> i32x4 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_s64(a: i64x2, b: i64x2) -> i64x2 { @@ -65,7 +65,7 @@ pub unsafe fn vaddq_s64(a: i64x2, b: i64x2) -> i64x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_u8(a: u8x8, b: u8x8) -> u8x8 { @@ -73,7 +73,7 @@ pub unsafe fn vadd_u8(a: u8x8, b: u8x8) -> u8x8 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_u8(a: u8x16, b: u8x16) -> u8x16 { @@ -81,7 +81,7 @@ pub unsafe fn vaddq_u8(a: u8x16, b: u8x16) -> u8x16 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_u16(a: u16x4, b: u16x4) -> u16x4 { @@ -89,7 +89,7 @@ pub unsafe fn vadd_u16(a: u16x4, b: u16x4) -> u16x4 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_u16(a: u16x8, b: u16x8) -> u16x8 { @@ -97,7 +97,7 @@ pub unsafe fn vaddq_u16(a: u16x8, b: u16x8) -> u16x8 { } /// Vector add. 
-#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_u32(a: u32x2, b: u32x2) -> u32x2 { @@ -105,7 +105,7 @@ pub unsafe fn vadd_u32(a: u32x2, b: u32x2) -> u32x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_u32(a: u32x4, b: u32x4) -> u32x4 { @@ -113,7 +113,7 @@ pub unsafe fn vaddq_u32(a: u32x4, b: u32x4) -> u32x4 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_u64(a: u64x2, b: u64x2) -> u64x2 { @@ -121,7 +121,7 @@ pub unsafe fn vaddq_u64(a: u64x2, b: u64x2) -> u64x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] pub unsafe fn vadd_f32(a: f32x2, b: f32x2) -> f32x2 { @@ -129,7 +129,7 @@ pub unsafe fn vadd_f32(a: f32x2, b: f32x2) -> f32x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] pub unsafe fn vaddq_f32(a: f32x4, b: f32x4) -> f32x4 { @@ -137,7 +137,7 @@ pub unsafe fn vaddq_f32(a: f32x4, b: f32x4) -> f32x4 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s8(a: i8x8, b: i8x8) -> i16x8 { @@ -147,7 +147,7 @@ pub unsafe fn vaddl_s8(a: i8x8, b: i8x8) -> i16x8 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s16(a: i16x4, b: i16x4) -> i32x4 { @@ -157,7 +157,7 @@ pub unsafe fn vaddl_s16(a: i16x4, b: i16x4) -> i32x4 { } /// Vector long add. 
-#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s32(a: i32x2, b: i32x2) -> i64x2 { @@ -167,7 +167,7 @@ pub unsafe fn vaddl_s32(a: i32x2, b: i32x2) -> i64x2 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u8(a: u8x8, b: u8x8) -> u16x8 { @@ -177,7 +177,7 @@ pub unsafe fn vaddl_u8(a: u8x8, b: u8x8) -> u16x8 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u16(a: u16x4, b: u16x4) -> u32x4 { @@ -187,7 +187,7 @@ pub unsafe fn vaddl_u16(a: u16x4, b: u16x4) -> u32x4 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u32(a: u32x2, b: u32x2) -> u64x2 { @@ -205,7 +205,7 @@ extern "C" { } /// Reciprocal square-root estimate. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(frsqrte))] pub unsafe fn vrsqrte_f32(a: f32x2) -> f32x2 { diff --git a/library/stdarch/coresimd/src/arm/v6.rs b/library/stdarch/coresimd/src/arm/v6.rs index 33fdda67e940..c2011fba7857 100644 --- a/library/stdarch/coresimd/src/arm/v6.rs +++ b/library/stdarch/coresimd/src/arm/v6.rs @@ -10,14 +10,14 @@ use stdsimd_test::assert_instr; /// Reverse the order of the bytes. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rev))] pub unsafe fn _rev_u16(x: u16) -> u16 { x.swap_bytes() as u16 } /// Reverse the order of the bytes. 
-#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rev))] pub unsafe fn _rev_u32(x: u32) -> u32 { x.swap_bytes() as u32 diff --git a/library/stdarch/coresimd/src/arm/v7.rs b/library/stdarch/coresimd/src/arm/v7.rs index b62001311430..f8a735f157b1 100644 --- a/library/stdarch/coresimd/src/arm/v7.rs +++ b/library/stdarch/coresimd/src/arm/v7.rs @@ -13,28 +13,28 @@ pub use super::v6::*; use stdsimd_test::assert_instr; /// Count Leading Zeros. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(clz))] pub unsafe fn _clz_u8(x: u8) -> u8 { x.leading_zeros() as u8 } /// Count Leading Zeros. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(clz))] pub unsafe fn _clz_u16(x: u16) -> u16 { x.leading_zeros() as u16 } /// Count Leading Zeros. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(clz))] pub unsafe fn _clz_u32(x: u32) -> u32 { x.leading_zeros() as u32 } /// Reverse the bit order. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rbit))] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg(dont_compile_me)] // FIXME need to add `v7` upstream in rustc diff --git a/library/stdarch/coresimd/src/nvptx/mod.rs b/library/stdarch/coresimd/src/nvptx/mod.rs index 21248e31938a..8a444b43aefc 100644 --- a/library/stdarch/coresimd/src/nvptx/mod.rs +++ b/library/stdarch/coresimd/src/nvptx/mod.rs @@ -42,79 +42,79 @@ extern "C" { } /// Synchronizes all threads in the block. -#[inline(always)] +#[inline] pub unsafe fn _syncthreads() -> () { syncthreads() } /// x-th thread-block dimension. -#[inline(always)] +#[inline] pub unsafe fn _block_dim_x() -> i32 { block_dim_x() } /// y-th thread-block dimension. -#[inline(always)] +#[inline] pub unsafe fn _block_dim_y() -> i32 { block_dim_y() } /// z-th thread-block dimension. -#[inline(always)] +#[inline] pub unsafe fn _block_dim_z() -> i32 { block_dim_z() } /// x-th thread-block index. 
-#[inline(always)] +#[inline] pub unsafe fn _block_idx_x() -> i32 { block_idx_x() } /// y-th thread-block index. -#[inline(always)] +#[inline] pub unsafe fn _block_idx_y() -> i32 { block_idx_y() } /// z-th thread-block index. -#[inline(always)] +#[inline] pub unsafe fn _block_idx_z() -> i32 { block_idx_z() } /// x-th block-grid dimension. -#[inline(always)] +#[inline] pub unsafe fn _grid_dim_x() -> i32 { grid_dim_x() } /// y-th block-grid dimension. -#[inline(always)] +#[inline] pub unsafe fn _grid_dim_y() -> i32 { grid_dim_y() } /// z-th block-grid dimension. -#[inline(always)] +#[inline] pub unsafe fn _grid_dim_z() -> i32 { grid_dim_z() } /// x-th thread index. -#[inline(always)] +#[inline] pub unsafe fn _thread_idx_x() -> i32 { thread_idx_x() } /// y-th thread index. -#[inline(always)] +#[inline] pub unsafe fn _thread_idx_y() -> i32 { thread_idx_y() } /// z-th thread index. -#[inline(always)] +#[inline] pub unsafe fn _thread_idx_z() -> i32 { thread_idx_z() } diff --git a/library/stdarch/coresimd/src/x86/i386/fxsr.rs b/library/stdarch/coresimd/src/x86/i386/fxsr.rs index 28c8fb5c2a8d..b67057880a2d 100644 --- a/library/stdarch/coresimd/src/x86/i386/fxsr.rs +++ b/library/stdarch/coresimd/src/x86/i386/fxsr.rs @@ -21,7 +21,7 @@ extern "C" { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html -#[inline(always)] +#[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxsave))] pub unsafe fn _fxsave(mem_addr: *mut u8) { @@ -42,7 +42,7 @@ pub unsafe fn _fxsave(mem_addr: *mut u8) { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html -#[inline(always)] +#[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxrstor))] pub unsafe fn _fxrstor(mem_addr: *const u8) { diff --git a/library/stdarch/coresimd/src/x86/i586/abm.rs b/library/stdarch/coresimd/src/x86/i586/abm.rs index 
8ee4659d2d01..5480b964ab99 100644 --- a/library/stdarch/coresimd/src/x86/i586/abm.rs +++ b/library/stdarch/coresimd/src/x86/i586/abm.rs @@ -23,7 +23,7 @@ use stdsimd_test::assert_instr; /// Counts the leading most significant zero bits. /// /// When the operand is zero, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "lzcnt")] #[cfg_attr(test, assert_instr(lzcnt))] pub unsafe fn _lzcnt_u32(x: u32) -> u32 { @@ -31,7 +31,7 @@ pub unsafe fn _lzcnt_u32(x: u32) -> u32 { } /// Counts the bits that are set. -#[inline(always)] +#[inline] #[target_feature(enable = "popcnt")] #[cfg_attr(test, assert_instr(popcnt))] pub unsafe fn _popcnt32(x: i32) -> i32 { diff --git a/library/stdarch/coresimd/src/x86/i586/avx.rs b/library/stdarch/coresimd/src/x86/i586/avx.rs index c21b9a0caf7d..cba133c734cc 100644 --- a/library/stdarch/coresimd/src/x86/i586/avx.rs +++ b/library/stdarch/coresimd/src/x86/i586/avx.rs @@ -26,7 +26,7 @@ use x86::*; /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddpd))] pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d { @@ -35,7 +35,7 @@ pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d { /// Add packed single-precision (32-bit) floating-point elements in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddps))] pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { @@ -45,7 +45,7 @@ pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { /// Compute the bitwise AND of a packed double-precision (64-bit) /// floating-point elements /// in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // FIXME: Should be 'vandpd' instuction. 
// See https://github.com/rust-lang-nursery/stdsimd/issues/71 @@ -58,7 +58,7 @@ pub unsafe fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise AND of packed single-precision (32-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vandps))] pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 { @@ -69,7 +69,7 @@ pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 { /// Compute the bitwise OR packed double-precision (64-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // FIXME: Should be 'vorpd' instuction. // See https://github.com/rust-lang-nursery/stdsimd/issues/71 @@ -82,7 +82,7 @@ pub unsafe fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise OR packed single-precision (32-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vorps))] pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 { @@ -93,7 +93,7 @@ pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 { /// Shuffle double-precision (64-bit) floating-point elements within 128-bit /// lanes using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x1))] pub unsafe fn _mm256_shuffle_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { @@ -135,7 +135,7 @@ pub unsafe fn _mm256_shuffle_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { /// Shuffle single-precision (32-bit) floating-point elements in `a` within /// 128-bit lanes using the control in `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vshufps, imm8 = 0x0))] pub unsafe fn _mm256_shuffle_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -186,7 +186,7 @@ pub unsafe fn _mm256_shuffle_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// Compute the bitwise NOT of packed double-precision (64-bit) floating-point /// elements in `a` /// and then AND with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // FIXME: Should be 'vandnpd' instruction. #[cfg_attr(test, assert_instr(vandnps))] @@ -199,7 +199,7 @@ pub unsafe fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point /// elements in `a` /// and then AND with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vandnps))] pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { @@ -210,7 +210,7 @@ pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// Compare packed double-precision (64-bit) floating-point elements /// in `a` and `b`, and return packed maximum values -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaxpd))] pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d { @@ -219,7 +219,7 @@ pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d { /// Compare packed single-precision (32-bit) floating-point elements in `a` /// and `b`, and return packed maximum values -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaxps))] pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 { @@ -228,7 +228,7 @@ pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 { /// Compare packed double-precision (64-bit) floating-point elements /// in `a` and `b`, and return packed minimum values -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, 
assert_instr(vminpd))] pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d { @@ -237,7 +237,7 @@ pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d { /// Compare packed single-precision (32-bit) floating-point elements in `a` /// and `b`, and return packed minimum values -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vminps))] pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 { @@ -246,7 +246,7 @@ pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 { /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmulpd))] pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d { @@ -255,7 +255,7 @@ pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d { /// Add packed single-precision (32-bit) floating-point elements in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmulps))] pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { @@ -264,7 +264,7 @@ pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { /// Alternatively add and subtract packed double-precision (64-bit) /// floating-point elements in `a` to/from packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddsubpd))] pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { @@ -273,7 +273,7 @@ pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { /// Alternatively add and subtract packed single-precision (32-bit) /// floating-point elements in `a` to/from packed elements in `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddsubps))] pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { @@ -282,7 +282,7 @@ pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { /// Subtract packed double-precision (64-bit) floating-point elements in `b` /// from packed elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsubpd))] pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { @@ -291,7 +291,7 @@ pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { /// Subtract packed single-precision (32-bit) floating-point elements in `b` /// from packed elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsubps))] pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { @@ -300,7 +300,7 @@ pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { /// Compute the division of each of the 8 packed 32-bit floating-point elements /// in `a` by the corresponding packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vdivps))] pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { @@ -309,7 +309,7 @@ pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { /// Compute the division of each of the 4 packed 64-bit floating-point elements /// in `a` by the corresponding packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vdivpd))] pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { @@ -327,7 +327,7 @@ pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { /// For a complete list of options, check [the LLVM docs][llvm_docs]. 
/// /// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundpd, b = 0x3))] pub unsafe fn _mm256_round_pd(a: __m256d, b: i32) -> __m256d { @@ -339,7 +339,7 @@ pub unsafe fn _mm256_round_pd(a: __m256d, b: i32) -> __m256d { /// Round packed double-precision (64-bit) floating point elements in `a` /// toward positive infinity. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundpd))] pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d { @@ -348,7 +348,7 @@ pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d { /// Round packed double-precision (64-bit) floating point elements in `a` /// toward negative infinity. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundpd))] pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d { @@ -366,7 +366,7 @@ pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d { /// For a complete list of options, check [the LLVM docs][llvm_docs]. /// /// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundps, b = 0x00))] pub unsafe fn _mm256_round_ps(a: __m256, b: i32) -> __m256 { @@ -380,7 +380,7 @@ pub unsafe fn _mm256_round_ps(a: __m256, b: i32) -> __m256 { /// Round packed single-precision (32-bit) floating point elements in `a` /// toward positive infinity. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundps))] pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 { @@ -389,7 +389,7 @@ pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 { /// Round packed single-precision (32-bit) floating point elements in `a` /// toward negative infinity. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundps))] pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 { @@ -398,7 +398,7 @@ pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 { /// Return the square root of packed single-precision (32-bit) floating point /// elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 { @@ -407,7 +407,7 @@ pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 { /// Return the square root of packed double-precision (64-bit) floating point /// elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d { @@ -416,7 +416,7 @@ pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d { /// Blend packed double-precision (64-bit) floating-point elements from /// `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))] pub unsafe fn _mm256_blend_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { @@ -458,7 +458,7 @@ pub unsafe fn _mm256_blend_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { /// Blend packed single-precision (32-bit) floating-point elements from /// `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))] pub unsafe fn _mm256_blend_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -508,7 +508,7 @@ pub unsafe fn _mm256_blend_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// Blend packed double-precision (64-bit) floating-point elements from /// `a` and `b` using `c` as a mask. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendvpd))] pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { @@ -517,7 +517,7 @@ pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { /// Blend packed single-precision (32-bit) floating-point elements from /// `a` and `b` using `c` as a mask. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendvps))] pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { @@ -528,7 +528,7 @@ pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { /// elements in `a` and `b` using the high 4 bits in `imm8`, /// sum the four products, and conditionally return the sum /// using the low 4 bits of `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vdpps, imm8 = 0x0))] pub unsafe fn _mm256_dp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -542,7 +542,7 @@ pub unsafe fn _mm256_dp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// of 4 64-bit floating points `a` and `b`. /// In the result, sums of elements from `a` are returned in even locations, /// while sums of elements from `b` are returned in odd locations. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhaddpd))] pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { @@ -554,7 +554,7 @@ pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { /// In the result, sums of elements from `a` are returned in locations of /// indices 0, 1, 4, 5; while sums of elements from `b` are locations /// 2, 3, 6, 7. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhaddps))] pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { @@ -565,7 +565,7 @@ pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { /// of 4 64-bit floating points `a` and `b`. /// In the result, sums of elements from `a` are returned in even locations, /// while sums of elements from `b` are returned in odd locations. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhsubpd))] pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { @@ -577,7 +577,7 @@ pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { /// In the result, sums of elements from `a` are returned in locations of /// indices 0, 1, 4, 5; while sums of elements from `b` are locations /// 2, 3, 6, 7. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhsubps))] pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { @@ -586,7 +586,7 @@ pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // FIXME Should be 'vxorpd' instruction. #[cfg_attr(test, assert_instr(vxorps))] @@ -598,7 +598,7 @@ pub unsafe fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 { @@ -675,7 +675,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// Compare packed double-precision (64-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd pub unsafe fn _mm_cmp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { @@ -688,7 +688,7 @@ pub unsafe fn _mm_cmp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// Compare packed double-precision (64-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { @@ -701,7 +701,7 @@ pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { /// Compare packed single-precision (32-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps pub unsafe fn _mm_cmp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { @@ -714,7 +714,7 @@ pub unsafe fn _mm_cmp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Compare packed single-precision (32-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -729,7 +729,7 @@ pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// store the result in the lower element of returned vector, /// and copy the upper element from `a` to the upper element of returned /// vector. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] #[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { @@ -744,7 +744,7 @@ pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// store the result in the lower element of returned vector, /// and copy the upper 3 packed elements from `a` to the upper elements of /// returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] #[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss pub unsafe fn _mm_cmp_ss(a: __m128, b: __m128, imm8: i32) -> __m128 { @@ -756,7 +756,7 @@ pub unsafe fn _mm_cmp_ss(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtdq2pd))] pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d { @@ -765,7 +765,7 @@ pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d { /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtdq2ps))] pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 { @@ -774,7 +774,7 @@ pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 { /// Convert packed double-precision (64-bit) floating-point elements in `a` /// to packed single-precision (32-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtpd2ps))] pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { @@ -783,7 +783,7 @@ pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtps2dq))] pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i { @@ -792,7 +792,7 @@ pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i { /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed double-precision (64-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtps2pd))] pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d { @@ -801,7 +801,7 @@ pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d { /// Convert packed double-precision (64-bit) floating-point elements in `a` /// to packed 32-bit integers with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvttpd2dq))] pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i { @@ -810,7 +810,7 @@ pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i { /// Convert packed double-precision (64-bit) floating-point elements in `a` /// to packed 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtpd2dq))] pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i { @@ -819,7 +819,7 @@ pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i { /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvttps2dq))] pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i { @@ -828,7 +828,7 @@ pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i { /// Extract 128 bits (composed of 4 packed single-precision (32-bit) /// floating-point elements) from `a`, selected with `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vextractf128))] pub unsafe fn _mm256_extractf128_ps(a: __m256, imm8: i32) -> __m128 { @@ -840,7 +840,7 @@ pub unsafe fn _mm256_extractf128_ps(a: __m256, imm8: i32) -> __m128 { /// Extract 128 bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from `a`, selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vextractf128))] pub unsafe fn _mm256_extractf128_pd(a: __m256d, imm8: i32) -> __m128d { @@ -851,7 +851,7 @@ pub unsafe fn _mm256_extractf128_pd(a: __m256d, imm8: i32) -> __m128d { } /// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vextractf128))] pub unsafe fn _mm256_extractf128_si256(a: __m256i, imm8: i32) -> __m128i { @@ -864,7 +864,7 @@ pub unsafe fn _mm256_extractf128_si256(a: __m256i, imm8: i32) -> __m128i { } /// Zero the contents of all XMM or YMM registers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vzeroall))] pub unsafe fn _mm256_zeroall() { @@ -873,7 +873,7 @@ pub unsafe fn _mm256_zeroall() { /// Zero the upper 128 bits of all YMM registers; /// the lower 128-bits of the registers are unmodified. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vzeroupper))] pub unsafe fn _mm256_zeroupper() { @@ -882,7 +882,7 @@ pub unsafe fn _mm256_zeroupper() { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilps))] pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 { @@ -891,7 +891,7 @@ pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// using the control in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilps))] pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { @@ -900,7 +900,7 @@ pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] pub unsafe fn _mm256_permute_ps(a: __m256, imm8: i32) -> __m256 { @@ -952,7 +952,7 @@ pub unsafe fn _mm256_permute_ps(a: __m256, imm8: i32) -> __m256 { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] pub unsafe fn _mm_permute_ps(a: __m128, imm8: i32) -> __m128 { @@ -1005,7 +1005,7 @@ pub unsafe fn _mm_permute_ps(a: __m128, imm8: i32) -> __m128 { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// within 256-bit lanes using the control in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilpd))] pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { @@ -1014,7 +1014,7 @@ pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// using the control in `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilpd))] pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { @@ -1023,7 +1023,7 @@ pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] pub unsafe fn _mm256_permute_pd(a: __m256d, imm8: i32) -> __m256d { @@ -1065,7 +1065,7 @@ pub unsafe fn _mm256_permute_pd(a: __m256d, imm8: i32) -> __m256d { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d { @@ -1091,7 +1091,7 @@ pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d { /// Shuffle 256-bits (composed of 8 packed single-precision (32-bit) /// floating-point elements) selected by `imm8` from `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x5))] pub unsafe fn _mm256_permute2f128_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -1103,7 +1103,7 @@ pub unsafe fn _mm256_permute2f128_ps(a: __m256, b: __m256, imm8: i32) -> __m256 /// Shuffle 256-bits (composed of 4 packed double-precision (64-bit) /// floating-point elements) selected by `imm8` from `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))] pub unsafe fn _mm256_permute2f128_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { @@ -1115,7 +1115,7 @@ pub unsafe fn _mm256_permute2f128_pd(a: __m256d, b: __m256d, imm8: i32) -> __m25 /// Shuffle 258-bits (composed of integer data) selected by `imm8` /// from `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))] pub unsafe fn _mm256_permute2f128_si256( @@ -1132,7 +1132,7 @@ pub unsafe fn _mm256_permute2f128_si256( /// Broadcast a single-precision (32-bit) floating-point element from memory /// to all elements of the returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 { @@ -1141,7 +1141,7 @@ pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 { /// Broadcast a single-precision (32-bit) floating-point element from memory /// to all elements of the returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 { @@ -1150,7 +1150,7 @@ pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 { /// Broadcast a double-precision (64-bit) floating-point element from memory /// to all elements of the returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastsd))] pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d { @@ -1159,7 +1159,7 @@ pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d { /// Broadcast 128 bits from memory (composed of 4 packed single-precision /// (32-bit) floating-point elements) to all elements of the returned vector. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastf128))] pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 { @@ -1168,7 +1168,7 @@ pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 { /// Broadcast 128 bits from memory (composed of 2 packed double-precision /// (64-bit) floating-point elements) to all elements of the returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastf128))] pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { @@ -1178,7 +1178,7 @@ pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { /// Copy `a` to result, then insert 128 bits (composed of 4 packed /// single-precision (32-bit) floating-point elements) from `b` into result /// at the location specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 { @@ -1192,7 +1192,7 @@ pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 { /// Copy `a` to result, then insert 128 bits (composed of 2 packed /// double-precision (64-bit) floating-point elements) from `b` into result /// at the location specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_insertf128_pd(a: __m256d, b: __m128d, imm8: i32) -> __m256d { @@ -1204,7 +1204,7 @@ pub unsafe fn _mm256_insertf128_pd(a: __m256d, b: __m128d, imm8: i32) -> __m256d /// Copy `a` to result, then insert 128 bits from `b` into result /// at the location specified by `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_insertf128_si256( @@ -1220,7 +1220,7 @@ pub unsafe fn _mm256_insertf128_si256( /// Copy `a` to result, and insert the 8-bit integer `i` into result /// at the location specified by `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i { @@ -1229,7 +1229,7 @@ pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i { /// Copy `a` to result, and insert the 16-bit integer `i` into result /// at the location specified by `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16, index: i32) -> __m256i { @@ -1238,7 +1238,7 @@ pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16, index: i32) -> __m256i { /// Copy `a` to result, and insert the 32-bit integer `i` into result /// at the location specified by `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_insert_epi32(a: __m256i, i: i32, index: i32) -> __m256i { @@ -1249,7 +1249,7 @@ pub unsafe fn _mm256_insert_epi32(a: __m256i, i: i32, index: i32) -> __m256i { /// floating-point elements) from memory into result. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d { @@ -1260,7 +1260,7 @@ pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d { /// floating-point elements) from `a` into memory. 
/// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected pub unsafe fn _mm256_store_pd(mem_addr: *const f64, a: __m256d) { @@ -1271,7 +1271,7 @@ pub unsafe fn _mm256_store_pd(mem_addr: *const f64, a: __m256d) { /// floating-point elements) from memory into result. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 { @@ -1282,7 +1282,7 @@ pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 { /// floating-point elements) from `a` into memory. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] pub unsafe fn _mm256_store_ps(mem_addr: *const f32, a: __m256) { @@ -1292,7 +1292,7 @@ pub unsafe fn _mm256_store_ps(mem_addr: *const f32, a: __m256) { /// Load 256-bits (composed of 4 packed double-precision (64-bit) /// floating-point elements) from memory into result. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d { @@ -1308,7 +1308,7 @@ pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d { /// Store 256-bits (composed of 4 packed double-precision (64-bit) /// floating-point elements) from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) { @@ -1318,7 +1318,7 @@ pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) { /// Load 256-bits (composed of 8 packed single-precision (32-bit) /// floating-point elements) from memory into result. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 { @@ -1334,7 +1334,7 @@ pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 { /// Store 256-bits (composed of 8 packed single-precision (32-bit) /// floating-point elements) from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) { @@ -1344,7 +1344,7 @@ pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) { /// Load 256-bits of integer data from memory into result. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i { @@ -1354,7 +1354,7 @@ pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i { /// Store 256-bits of integer data from `a` into memory. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) { @@ -1363,7 +1363,7 @@ pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) { /// Load 256-bits of integer data from memory into result. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i { @@ -1378,7 +1378,7 @@ pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i { /// Store 256-bits of integer data from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { @@ -1388,7 +1388,7 @@ pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { /// Load packed double-precision (64-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d { @@ -1397,7 +1397,7 @@ pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d /// Store packed double-precision (64-bit) floating-point elements from `a` /// into memory using `mask`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) { @@ -1407,7 +1407,7 @@ pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) /// Load packed double-precision (64-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d { @@ -1416,7 +1416,7 @@ pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d { /// Store packed double-precision (64-bit) floating-point elements from `a` /// into memory using `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) { @@ -1426,7 +1426,7 @@ pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) { /// Load packed single-precision (32-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 { @@ -1435,7 +1435,7 @@ pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 /// Store packed single-precision (32-bit) floating-point elements from `a` /// into memory using `mask`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) { @@ -1445,7 +1445,7 @@ pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) /// Load packed single-precision (32-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 { @@ -1454,7 +1454,7 @@ pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 { /// Store packed single-precision (32-bit) floating-point elements from `a` /// into memory using `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) { @@ -1463,7 +1463,7 @@ pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) { /// Duplicate odd-indexed single-precision (32-bit) floating-point elements /// from `a`, and return the results. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovshdup))] pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 { @@ -1472,7 +1472,7 @@ pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 { /// Duplicate even-indexed single-precision (32-bit) floating-point elements /// from `a`, and return the results. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovsldup))] pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 { @@ -1481,7 +1481,7 @@ pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 { /// Duplicate even-indexed double-precision (64-bit) floating-point elements /// from "a", and return the results. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovddup))] pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d { @@ -1491,7 +1491,7 @@ pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d { /// Load 256-bits of integer data from unaligned memory into result. /// This intrinsic may perform better than `_mm256_loadu_si256` when the /// data crosses a cache line boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vlddqu))] pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { @@ -1501,7 +1501,7 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { /// Moves integer data from a 256-bit integer vector to a 32-byte /// aligned memory location. To minimize caching, the data is flagged as /// non-temporal (unlikely to be used again soon) -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq pub unsafe fn _mm256_stream_si256(mem_addr: *const __m256i, a: __m256i) { @@ -1511,7 +1511,7 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *const __m256i, a: __m256i) { /// Moves double-precision values from a 256-bit vector of [4 x double] /// to a 32-byte aligned memory location. To minimize caching, the data is /// flagged as non-temporal (unlikely to be used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntpd pub unsafe fn _mm256_stream_pd(mem_addr: *const f64, a: __m256d) { @@ -1522,7 +1522,7 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *const f64, a: __m256d) { /// of [8 x float] to a 32-byte aligned memory location. To minimize /// caching, the data is flagged as non-temporal (unlikely to be used again /// soon). 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovntps))] pub unsafe fn _mm256_stream_ps(mem_addr: *const f32, a: __m256) { @@ -1532,7 +1532,7 @@ pub unsafe fn _mm256_stream_ps(mem_addr: *const f32, a: __m256) { /// Compute the approximate reciprocal of packed single-precision (32-bit) /// floating-point elements in `a`, and return the results. The maximum /// relative error for this approximation is less than 1.5*2^-12. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vrcpps))] pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 { @@ -1542,7 +1542,7 @@ pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 { /// Compute the approximate reciprocal square root of packed single-precision /// (32-bit) floating-point elements in `a`, and return the results. /// The maximum relative error for this approximation is less than 1.5*2^-12. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vrsqrtps))] pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 { @@ -1551,7 +1551,7 @@ pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 { /// Unpack and interleave double-precision (64-bit) floating-point elements /// from the high half of each 128-bit lane in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpckhpd))] pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d { @@ -1560,7 +1560,7 @@ pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d { /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the high half of each 128-bit lane in `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpckhps))] pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 { @@ -1569,7 +1569,7 @@ pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 { /// Unpack and interleave double-precision (64-bit) floating-point elements /// from the low half of each 128-bit lane in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpcklpd))] pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d { @@ -1578,7 +1578,7 @@ pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d { /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the low half of each 128-bit lane in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpcklps))] pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { @@ -1589,7 +1589,7 @@ pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. /// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if /// the result is zero, otherwise set `CF` to 0. Return the `ZF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vptest))] pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { @@ -1600,7 +1600,7 @@ pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. /// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if /// the result is zero, otherwise set `CF` to 0. Return the `CF` value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vptest))] pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { @@ -1612,7 +1612,7 @@ pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if /// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and /// `CF` values are zero, otherwise return 0. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vptest))] pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { @@ -1626,7 +1626,7 @@ pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { @@ -1640,7 +1640,7 @@ pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { @@ -1655,7 +1655,7 @@ pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { @@ -1669,7 +1669,7 @@ pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { @@ -1683,7 +1683,7 @@ pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { @@ -1698,7 +1698,7 @@ pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { @@ -1712,7 +1712,7 @@ pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { @@ -1726,7 +1726,7 @@ pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { @@ -1741,7 +1741,7 @@ pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { @@ -1755,7 +1755,7 @@ pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { @@ -1769,7 +1769,7 @@ pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { @@ -1784,7 +1784,7 @@ pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { @@ -1794,7 +1794,7 @@ pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { /// Set each bit of the returned mask based on the most significant bit of the /// corresponding packed double-precision (64-bit) floating-point element in /// `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovmskpd))] pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { @@ -1804,7 +1804,7 @@ pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { /// Set each bit of the returned mask based on the most significant bit of the /// corresponding packed single-precision (32-bit) floating-point element in /// `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovmskps))] pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 { @@ -1812,7 +1812,7 @@ pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 { } /// Return vector of type __m256d with all elements set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxorps))] // FIXME vxorpd expected pub unsafe fn _mm256_setzero_pd() -> __m256d { @@ -1820,7 +1820,7 @@ pub unsafe fn _mm256_setzero_pd() -> __m256d { } /// Return vector of type __m256 with all elements set to zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm256_setzero_ps() -> __m256 { @@ -1828,7 +1828,7 @@ pub unsafe fn _mm256_setzero_ps() -> __m256 { } /// Return vector of type __m256i with all elements set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxor))] pub unsafe fn _mm256_setzero_si256() -> __m256i { @@ -1837,7 +1837,7 @@ pub unsafe fn _mm256_setzero_si256() -> __m256i { /// Set packed double-precision (64-bit) floating-point elements in returned /// vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[cfg_attr(test, assert_instr(vinsertf128))] @@ -1847,7 +1847,7 @@ pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { /// Set packed single-precision (32-bit) floating-point elements in returned /// vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set_ps( @@ -1858,7 +1858,7 @@ pub unsafe fn _mm256_set_ps( /// Set packed 8-bit integers in returned vector with the supplied values in /// reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set_epi8( @@ -1877,7 +1877,7 @@ pub unsafe fn _mm256_set_epi8( } /// Set packed 16-bit integers in returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set_epi16( @@ -1895,7 +1895,7 @@ pub unsafe fn _mm256_set_epi16( } /// Set packed 32-bit integers in returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. 
pub unsafe fn _mm256_set_epi32( @@ -1905,7 +1905,7 @@ pub unsafe fn _mm256_set_epi32( } /// Set packed 64-bit integers in returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[cfg_attr(test, assert_instr(vinsertf128))] @@ -1915,7 +1915,7 @@ pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { /// Set packed double-precision (64-bit) floating-point elements in returned /// vector with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { @@ -1924,7 +1924,7 @@ pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { /// Set packed single-precision (32-bit) floating-point elements in returned /// vector with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_setr_ps( @@ -1935,7 +1935,7 @@ pub unsafe fn _mm256_setr_ps( /// Set packed 8-bit integers in returned vector with the supplied values in /// reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_setr_epi8( @@ -1955,7 +1955,7 @@ pub unsafe fn _mm256_setr_epi8( /// Set packed 16-bit integers in returned vector with the supplied values in /// reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_setr_epi16( @@ -1974,7 +1974,7 @@ pub unsafe fn _mm256_setr_epi16( /// Set packed 32-bit integers in returned vector with the supplied values in /// reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. 
pub unsafe fn _mm256_setr_epi32( @@ -1985,7 +1985,7 @@ pub unsafe fn _mm256_setr_epi32( /// Set packed 64-bit integers in returned vector with the supplied values in /// reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[cfg_attr(test, assert_instr(vinsertf128))] @@ -1995,7 +1995,7 @@ pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { /// Broadcast double-precision (64-bit) floating-point value `a` to all /// elements of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d { @@ -2004,7 +2004,7 @@ pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d { /// Broadcast single-precision (32-bit) floating-point value `a` to all /// elements of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 { @@ -2013,7 +2013,7 @@ pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 { /// Broadcast 8-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastb`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpshufb))] #[cfg_attr(test, assert_instr(vinsertf128))] @@ -2030,7 +2030,7 @@ pub unsafe fn _mm256_set1_epi8(a: i8) -> __m256i { /// Broadcast 16-bit integer `a` to all all elements of returned vector. /// This intrinsic may generate the `vpbroadcastw`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] //#[cfg_attr(test, assert_instr(vpshufb))] #[cfg_attr(test, assert_instr(vinsertf128))] @@ -2041,7 +2041,7 @@ pub unsafe fn _mm256_set1_epi16(a: i16) -> __m256i { /// Broadcast 32-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastd`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i { @@ -2050,7 +2050,7 @@ pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i { /// Broadcast 64-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastq`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] //#[cfg_attr(test, assert_instr(vmovddup))] #[cfg_attr(test, assert_instr(vinsertf128))] @@ -2060,7 +2060,7 @@ pub unsafe fn _mm256_set1_epi64x(a: i64) -> __m256i { } /// Cast vector of type __m256d to type __m256. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2069,7 +2069,7 @@ pub unsafe fn _mm256_castpd_ps(a: __m256d) -> __m256 { } /// Cast vector of type __m256 to type __m256d. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2078,7 +2078,7 @@ pub unsafe fn _mm256_castps_pd(a: __m256) -> __m256d { } /// Casts vector of type __m256 to type __m256i. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2087,7 +2087,7 @@ pub unsafe fn _mm256_castps_si256(a: __m256) -> __m256i { } /// Casts vector of type __m256i to type __m256. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2096,7 +2096,7 @@ pub unsafe fn _mm256_castsi256_ps(a: __m256i) -> __m256 { } /// Casts vector of type __m256d to type __m256i. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2105,7 +2105,7 @@ pub unsafe fn _mm256_castpd_si256(a: __m256d) -> __m256i { } /// Casts vector of type __m256i to type __m256d. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2114,7 +2114,7 @@ pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d { } /// Casts vector of type __m256 to type __m128. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2123,7 +2123,7 @@ pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 { } /// Casts vector of type __m256d to type __m128d. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2132,7 +2132,7 @@ pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d { } /// Casts vector of type __m256i to type __m128i. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2144,7 +2144,7 @@ pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i { /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2155,7 +2155,7 @@ pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 { /// Casts vector of type __m128d to type __m256d; /// the upper 128 bits of the result are undefined. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2166,7 +2166,7 @@ pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d { /// Casts vector of type __m128i to type __m256i; /// the upper 128 bits of the result are undefined. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2180,7 +2180,7 @@ pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i { /// Constructs a 256-bit floating-point vector of [8 x float] from a /// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain /// the value of the source vector. The upper 128 bits are set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2191,7 +2191,7 @@ pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 { /// Constructs a 256-bit integer vector from a 128-bit integer vector. /// The lower 128 bits contain the value of the source vector. The upper /// 128 bits are set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2205,7 +2205,7 @@ pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i { /// 128-bit floating-point vector of [2 x double]. The lower 128 bits /// contain the value of the source vector. The upper 128 bits are set /// to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. 
@@ -2214,7 +2214,7 @@ pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d { } /// Return vector of type `__m256` with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_undefined_ps() -> __m256 { @@ -2222,7 +2222,7 @@ pub unsafe fn _mm256_undefined_ps() -> __m256 { } /// Return vector of type `__m256d` with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_undefined_pd() -> __m256d { @@ -2230,7 +2230,7 @@ pub unsafe fn _mm256_undefined_pd() -> __m256d { } /// Return vector of type __m256i with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_undefined_si256() -> __m256i { @@ -2238,7 +2238,7 @@ pub unsafe fn _mm256_undefined_si256() -> __m256i { } /// Set packed __m256 returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 { @@ -2246,7 +2246,7 @@ pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 { } /// Set packed __m256d returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d { @@ -2256,7 +2256,7 @@ pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d { } /// Set packed __m256i returned vector with the supplied values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { @@ -2266,7 +2266,7 @@ pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { } /// Set packed __m256 returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 { @@ -2274,7 +2274,7 @@ pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 { } /// Set packed __m256d returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d { @@ -2282,7 +2282,7 @@ pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d { } /// Set packed __m256i returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { @@ -2293,7 +2293,7 @@ pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { /// floating-point elements) from memory, and combine them into a 256-bit /// value. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_loadu2_m128( @@ -2307,7 +2307,7 @@ pub unsafe fn _mm256_loadu2_m128( /// floating-point elements) from memory, and combine them into a 256-bit /// value. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. 
pub unsafe fn _mm256_loadu2_m128d( @@ -2320,7 +2320,7 @@ pub unsafe fn _mm256_loadu2_m128d( /// Load two 128-bit values (composed of integer data) from memory, and combine /// them into a 256-bit value. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_loadu2_m128i( @@ -2335,7 +2335,7 @@ pub unsafe fn _mm256_loadu2_m128i( /// single-precision (32-bit) floating-point elements) from `a` into memory two /// different 128-bit locations. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_storeu2_m128( @@ -2351,7 +2351,7 @@ pub unsafe fn _mm256_storeu2_m128( /// double-precision (64-bit) floating-point elements) from `a` into memory two /// different 128-bit locations. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_storeu2_m128d( @@ -2366,7 +2366,7 @@ pub unsafe fn _mm256_storeu2_m128d( /// Store the high and low 128-bit halves (each composed of integer data) from /// `a` into memory two different 128-bit locations. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_storeu2_m128i( @@ -2380,7 +2380,7 @@ pub unsafe fn _mm256_storeu2_m128i( } /// Returns the first element of the input vector of [8 x float]. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] //#[cfg_attr(test, assert_instr(movss))] FIXME pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 { diff --git a/library/stdarch/coresimd/src/x86/i586/avx2.rs b/library/stdarch/coresimd/src/x86/i586/avx2.rs index 72892913bbfa..540031009eae 100644 --- a/library/stdarch/coresimd/src/x86/i586/avx2.rs +++ b/library/stdarch/coresimd/src/x86/i586/avx2.rs @@ -31,7 +31,7 @@ use x86::*; use stdsimd_test::assert_instr; /// Computes the absolute values of packed 32-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsd))] pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i { @@ -39,7 +39,7 @@ pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i { } /// Computes the absolute values of packed 16-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsw))] pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i { @@ -47,7 +47,7 @@ pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i { } /// Computes the absolute values of packed 8-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsb))] pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i { @@ -55,7 +55,7 @@ pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i { } /// Add packed 64-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddq))] pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -63,7 +63,7 @@ pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 32-bit integers in `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddd))] pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -71,7 +71,7 @@ pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddw))] pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -79,7 +79,7 @@ pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 8-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddb))] pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -87,7 +87,7 @@ pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddsb))] pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -95,7 +95,7 @@ pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddsw))] pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -103,7 +103,7 @@ pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddusb))] pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -111,7 +111,7 @@ pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddusw))] pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -120,7 +120,7 @@ pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { /// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary /// result, shift the result right by `n` bytes, and return the low 16 bytes. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpalignr, n = 15))] pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i { @@ -187,7 +187,7 @@ pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i { /// Compute the bitwise AND of 256 bits (representing integer data) /// in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vandps))] pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { @@ -196,7 +196,7 @@ pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { /// Compute the bitwise NOT of 256 bits (representing integer data) /// in `a` and then AND with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vandnps))] pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { @@ -205,7 +205,7 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { } /// Average packed unsigned 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpavgw))] pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -213,7 +213,7 @@ pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { } /// Average packed unsigned 8-bit integers in `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpavgb))] pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -221,7 +221,7 @@ pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { } /// Blend packed 32-bit integers from `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendd, imm8 = 9))] pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { @@ -253,7 +253,7 @@ pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { } /// Blend packed 32-bit integers from `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendd, imm8 = 9))] pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { @@ -305,7 +305,7 @@ pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { } /// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendw, imm8 = 9))] pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { @@ -359,7 +359,7 @@ pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { } /// Blend packed 8-bit integers from `a` and `b` using `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendvb))] pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { @@ -368,7 +368,7 @@ pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m25 /// Broadcast the low packed 8-bit integer from `a` to all elements of /// the 128-bit returned value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastb))] pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { @@ -379,7 +379,7 @@ pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { /// Broadcast the low packed 8-bit integer from `a` to all elements of /// the 256-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastb))] pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { @@ -392,7 +392,7 @@ pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { // often compiled to vbroadcastss. /// Broadcast the low packed 32-bit integer from `a` to all elements of /// the 128-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { @@ -405,7 +405,7 @@ pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { // often compiled to vbroadcastss. /// Broadcast the low packed 32-bit integer from `a` to all elements of /// the 256-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { @@ -416,7 +416,7 @@ pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { /// Broadcast the low packed 64-bit integer from `a` to all elements of /// the 128-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastq))] pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { @@ -429,7 +429,7 @@ pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { // often compiled to vbroadcastsd. /// Broadcast the low packed 64-bit integer from `a` to all elements of /// the 256-bit returned value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastsd))] pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { @@ -440,7 +440,7 @@ pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { /// Broadcast the low double-precision (64-bit) floating-point element /// from `a` to all elements of the 128-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vmovddup))] pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d { @@ -449,7 +449,7 @@ pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d { /// Broadcast the low double-precision (64-bit) floating-point element /// from `a` to all elements of the 256-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastsd))] pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d { @@ -460,7 +460,7 @@ pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d { // vbroadcastf128. /// Broadcast 128 bits of integer data from a to all 128-bit lanes in /// the 256-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { let zero = _mm_setzero_si128(); @@ -470,7 +470,7 @@ pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { /// Broadcast the low single-precision (32-bit) floating-point element /// from `a` to all elements of the 128-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 { @@ -479,7 +479,7 @@ pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 { /// Broadcast the low single-precision (32-bit) floating-point element /// from `a` to all elements of the 256-bit returned value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 { @@ -488,7 +488,7 @@ pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 { /// Broadcast the low packed 16-bit integer from a to all elements of /// the 128-bit returned value -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastw))] pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { @@ -499,7 +499,7 @@ pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { /// Broadcast the low packed 16-bit integer from a to all elements of /// the 256-bit returned value -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastw))] pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { @@ -509,7 +509,7 @@ pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { } /// Compare packed 64-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqq))] pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -517,7 +517,7 @@ pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 32-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqd))] pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -525,7 +525,7 @@ pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 16-bit integers in `a` and `b` for equality. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqw))] pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -533,7 +533,7 @@ pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 8-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqb))] pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -541,7 +541,7 @@ pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 64-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtq))] pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -549,7 +549,7 @@ pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 32-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtd))] pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -557,7 +557,7 @@ pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 16-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtw))] pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -565,7 +565,7 @@ pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 8-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtb))] pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -573,7 +573,7 @@ pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { } /// Sign-extend 16-bit integers to 32-bit integers. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxwd))] pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { @@ -581,7 +581,7 @@ pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { } /// Sign-extend 16-bit integers to 64-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxwq))] pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { @@ -591,7 +591,7 @@ pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { } /// Sign-extend 32-bit integers to 64-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxdq))] pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { @@ -599,7 +599,7 @@ pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { } /// Sign-extend 8-bit integers to 16-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxbw))] pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { @@ -607,7 +607,7 @@ pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { } /// Sign-extend 8-bit integers to 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxbd))] pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { @@ -617,7 +617,7 @@ pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { } /// Sign-extend 8-bit integers to 64-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxbq))] pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { @@ -628,7 +628,7 @@ pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { /// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit /// integers, and store the results in dst. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxwd))] pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { @@ -637,7 +637,7 @@ pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit /// integers. The upper four elements of `a` are unused. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxwq))] pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { @@ -647,7 +647,7 @@ pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { } /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxdq))] pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { @@ -655,7 +655,7 @@ pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { } /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxbw))] pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { @@ -664,7 +664,7 @@ pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit /// integers. The upper eight elements of `a` are unused. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxbd))] pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { @@ -675,7 +675,7 @@ pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit /// integers. The upper twelve elements of `a` are unused. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxbq))] pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { @@ -685,7 +685,7 @@ pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { } /// Extract 128 bits (of integer data) from `a` selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vextractf128, imm8 = 1))] pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i { @@ -699,7 +699,7 @@ pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i { } /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphaddw))] pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -707,7 +707,7 @@ pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphaddd))] pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -716,7 +716,7 @@ pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphaddsw))] pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -724,7 +724,7 @@ pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Horizontally substract adjacent pairs of 16-bit integers in `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubw))] pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -732,7 +732,7 @@ pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Horizontally substract adjacent pairs of 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubd))] pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -741,7 +741,7 @@ pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubsw))] pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -751,7 +751,7 @@ pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm_i32gather_epi32( @@ -772,7 +772,7 @@ pub unsafe fn _mm_i32gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm_mask_i32gather_epi32( @@ -792,7 +792,7 @@ pub unsafe fn _mm_mask_i32gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm256_i32gather_epi32( @@ -813,7 +813,7 @@ pub unsafe fn _mm256_i32gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm256_mask_i32gather_epi32( @@ -833,7 +833,7 @@ pub unsafe fn _mm256_mask_i32gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm_i32gather_ps( @@ -853,7 +853,7 @@ pub unsafe fn _mm_i32gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm_mask_i32gather_ps( @@ -870,7 +870,7 @@ pub unsafe fn _mm_mask_i32gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm256_i32gather_ps( @@ -890,7 +890,7 @@ pub unsafe fn _mm256_i32gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm256_mask_i32gather_ps( @@ -907,7 +907,7 @@ pub unsafe fn _mm256_mask_i32gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm_i32gather_epi64( @@ -928,7 +928,7 @@ pub unsafe fn _mm_i32gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm_mask_i32gather_epi64( @@ -948,7 +948,7 @@ pub unsafe fn _mm_mask_i32gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm256_i32gather_epi64( @@ -969,7 +969,7 @@ pub unsafe fn _mm256_i32gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm256_mask_i32gather_epi64( @@ -989,7 +989,7 @@ pub unsafe fn _mm256_mask_i32gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm_i32gather_pd( @@ -1009,7 +1009,7 @@ pub unsafe fn _mm_i32gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm_mask_i32gather_pd( @@ -1026,7 +1026,7 @@ pub unsafe fn _mm_mask_i32gather_pd( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm256_i32gather_pd( @@ -1046,7 +1046,7 @@ pub unsafe fn _mm256_i32gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm256_mask_i32gather_pd( @@ -1063,7 +1063,7 @@ pub unsafe fn _mm256_mask_i32gather_pd( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm_i64gather_epi32( @@ -1084,7 +1084,7 @@ pub unsafe fn _mm_i64gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm_mask_i64gather_epi32( @@ -1104,7 +1104,7 @@ pub unsafe fn _mm_mask_i64gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm256_i64gather_epi32( @@ -1125,7 +1125,7 @@ pub unsafe fn _mm256_i64gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm256_mask_i64gather_epi32( @@ -1145,7 +1145,7 @@ pub unsafe fn _mm256_mask_i64gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm_i64gather_ps( @@ -1165,7 +1165,7 @@ pub unsafe fn _mm_i64gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm_mask_i64gather_ps( @@ -1182,7 +1182,7 @@ pub unsafe fn _mm_mask_i64gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm256_i64gather_ps( @@ -1202,7 +1202,7 @@ pub unsafe fn _mm256_i64gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm256_mask_i64gather_ps( @@ -1219,7 +1219,7 @@ pub unsafe fn _mm256_mask_i64gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm_i64gather_epi64( @@ -1240,7 +1240,7 @@ pub unsafe fn _mm_i64gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm_mask_i64gather_epi64( @@ -1260,7 +1260,7 @@ pub unsafe fn _mm_mask_i64gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm256_i64gather_epi64( @@ -1281,7 +1281,7 @@ pub unsafe fn _mm256_i64gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm256_mask_i64gather_epi64( @@ -1301,7 +1301,7 @@ pub unsafe fn _mm256_mask_i64gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm_i64gather_pd( @@ -1321,7 +1321,7 @@ pub unsafe fn _mm_i64gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm_mask_i64gather_pd( @@ -1338,7 +1338,7 @@ pub unsafe fn _mm_mask_i64gather_pd( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm256_i64gather_pd( @@ -1358,7 +1358,7 @@ pub unsafe fn _mm256_i64gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm256_mask_i64gather_pd( @@ -1374,7 +1374,7 @@ pub unsafe fn _mm256_mask_i64gather_pd( /// Copy `a` to `dst`, then insert 128 bits (of integer data) from `b` at the /// location specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_inserti128_si256( @@ -1392,7 +1392,7 @@ pub unsafe fn _mm256_inserti128_si256( /// Multiply packed signed 16-bit integers in `a` and `b`, producing /// intermediate signed 32-bit integers. Horizontally add adjacent pairs /// of intermediate 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaddwd))] pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1403,7 +1403,7 @@ pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { /// corresponding signed 8-bit integer from `b`, producing intermediate /// signed 16-bit integers. Horizontally add adjacent pairs of intermediate /// signed 16-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaddubsw))] pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1413,7 +1413,7 @@ pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Load packed 32-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i { @@ -1423,7 +1423,7 @@ pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i /// Load packed 32-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] pub unsafe fn _mm256_maskload_epi32( @@ -1435,7 +1435,7 @@ pub unsafe fn _mm256_maskload_epi32( /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i { @@ -1445,7 +1445,7 @@ pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] pub unsafe fn _mm256_maskload_epi64( @@ -1457,7 +1457,7 @@ pub unsafe fn _mm256_maskload_epi64( /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) { @@ -1467,7 +1467,7 @@ pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] pub unsafe fn _mm256_maskstore_epi32( @@ -1479,7 +1479,7 @@ pub unsafe fn _mm256_maskstore_epi32( /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) { @@ -1489,7 +1489,7 @@ pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] pub unsafe fn _mm256_maskstore_epi64( @@ -1500,7 +1500,7 @@ pub unsafe fn _mm256_maskstore_epi64( /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// maximum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsw))] pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1509,7 +1509,7 @@ pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 32-bit integers in `a` and `b`, and return the packed /// maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsd))] pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1518,7 +1518,7 @@ pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 8-bit integers in `a` and `b`, and return the packed /// maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsb))] pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -1527,7 +1527,7 @@ pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 16-bit integers in `a` and `b`, and return /// the packed maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxuw))] pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -1536,7 +1536,7 @@ pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 32-bit integers in `a` and `b`, and return /// the packed maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxud))] pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { @@ -1545,7 +1545,7 @@ pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 8-bit integers in `a` and `b`, and return /// the packed maximum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxub))] pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -1554,7 +1554,7 @@ pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsw))] pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1563,7 +1563,7 @@ pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 32-bit integers in `a` and `b`, and return the packed /// minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsd))] pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1572,7 +1572,7 @@ pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 8-bit integers in `a` and `b`, and return the packed /// minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsb))] pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -1581,7 +1581,7 @@ pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 16-bit integers in `a` and `b`, and return /// the packed minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminuw))] pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -1590,7 +1590,7 @@ pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 32-bit integers in `a` and `b`, and return /// the packed minimum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminud))] pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { @@ -1599,7 +1599,7 @@ pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 8-bit integers in `a` and `b`, and return /// the packed minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminub))] pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -1608,7 +1608,7 @@ pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { /// Create mask from the most significant bit of each 8-bit element in `a`, /// return the result. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovmskb))] pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 { @@ -1622,7 +1622,7 @@ pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 { /// selected from `b` starting at on the offset specified in `imm8`. Eight /// quadruplets are formed from sequential 8-bit integers selected from `a` /// starting at the offset specified in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vmpsadbw, imm8 = 0))] pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i { @@ -1639,7 +1639,7 @@ pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i /// `a` and `b` /// /// Return the 64-bit results. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmuldq))] pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1650,7 +1650,7 @@ pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { /// element in `a` and `b` /// /// Return the unsigned 64-bit results. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmuludq))] pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { @@ -1660,7 +1660,7 @@ pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers and returning the high 16 bits of the /// intermediate integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhw))] pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1670,7 +1670,7 @@ pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers and returning the high 16 bits of the /// intermediate integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhuw))] pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -1680,7 +1680,7 @@ pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers, and return the low 16 bits of the /// intermediate integers -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmullw))] pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1690,7 +1690,7 @@ pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed 32-bit integers in `a` and `b`, producing /// intermediate 64-bit integers, and return the low 16 bits of the /// intermediate integers -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulld))] pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1701,7 +1701,7 @@ pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { /// 
intermediate signed 32-bit integers. Truncate each intermediate /// integer to the 18 most significant bits, round by adding 1, and /// return bits [16:1] -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhrsw))] pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1710,7 +1710,7 @@ pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Compute the bitwise OR of 256 bits (representing integer data) in `a` /// and `b` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vorps))] pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { @@ -1719,7 +1719,7 @@ pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpacksswb))] pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1728,7 +1728,7 @@ pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackssdw))] pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1737,7 +1737,7 @@ pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackuswb))] pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1746,7 +1746,7 @@ pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using unsigned 
saturation -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackusdw))] pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1757,7 +1757,7 @@ pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { /// /// The last 3 bits of each integer of `b` are used as addresses into the 8 /// integers of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermd))] pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1765,7 +1765,7 @@ pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Permutes 64-bit integers from `a` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermq, imm8 = 9))] pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i { @@ -1817,7 +1817,7 @@ pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i { } /// Shuffle 128-bits of integer data selected by `imm8` from `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))] pub unsafe fn _mm256_permute2x128_si256( @@ -1835,7 +1835,7 @@ pub unsafe fn _mm256_permute2x128_si256( /// Shuffle 64-bit floating-point elements in `a` across lanes using the /// control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))] pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d { @@ -1887,7 +1887,7 @@ pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d { /// Shuffle eight 32-bit foating-point elements in `a` across lanes using /// the corresponding 32-bit integer index in `idx`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermps))] pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { @@ -1898,7 +1898,7 @@ pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { /// and `b`, then horizontally sum each consecutive 8 differences to /// produce four unsigned 16-bit integers, and pack these unsigned 16-bit /// integers in the low 16 bits of the 64-bit return value -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsadbw))] pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -1934,7 +1934,7 @@ pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { /// r /// } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufb))] pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -1974,7 +1974,7 @@ pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i { @@ -2035,7 +2035,7 @@ pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i { /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using /// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied /// to the output. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))] pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2092,7 +2092,7 @@ pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i { /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using /// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied /// to the output. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))] pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2149,7 +2149,7 @@ pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i { /// Negate packed 16-bit integers in `a` when the corresponding signed /// 16-bit integer in `b` is negative, and return the results. /// Results are zeroed out when the corresponding element in `b` is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignw))] pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2159,7 +2159,7 @@ pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { /// Negate packed 32-bit integers in `a` when the corresponding signed /// 32-bit integer in `b` is negative, and return the results. /// Results are zeroed out when the corresponding element in `b` is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignd))] pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -2169,7 +2169,7 @@ pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { /// Negate packed 8-bit integers in `a` when the corresponding signed /// 8-bit integer in `b` is negative, and return the results. /// Results are zeroed out when the corresponding element in `b` is zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignb))] pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2178,7 +2178,7 @@ pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { /// Shift packed 16-bit integers in `a` left by `count` while /// shifting in zeros, and return the result -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllw))] pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { @@ -2187,7 +2187,7 @@ pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 32-bit integers in `a` left by `count` while /// shifting in zeros, and return the result -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslld))] pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { @@ -2196,7 +2196,7 @@ pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 64-bit integers in `a` left by `count` while /// shifting in zeros, and return the result -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllq))] pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { @@ -2205,7 +2205,7 @@ pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 16-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllw))] pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2214,7 +2214,7 @@ pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslld))] pub unsafe fn 
_mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i { @@ -2223,7 +2223,7 @@ pub unsafe fn _mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 64-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllq))] pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i { @@ -2231,7 +2231,7 @@ pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i { } /// Shift 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))] pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i { @@ -2245,7 +2245,7 @@ pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i { } /// Shift 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))] pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i { @@ -2255,7 +2255,7 @@ pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvd))] pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -2265,7 +2265,7 @@ pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvd))] pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { @@ -2275,7 +2275,7 @@ pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { /// Shift packed 64-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvq))] pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { @@ -2285,7 +2285,7 @@ pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 64-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvq))] pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { @@ -2294,7 +2294,7 @@ pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { /// Shift packed 16-bit integers in `a` right by `count` while /// shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsraw))] pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { @@ -2303,7 +2303,7 @@ pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 32-bit integers in `a` right by `count` while /// shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrad))] pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { @@ -2312,7 +2312,7 @@ pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 16-bit integers in `a` right by `imm8` while /// shifting in sign bits. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsraw))] pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2321,7 +2321,7 @@ pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` right by `imm8` while /// shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrad))] pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i { @@ -2330,7 +2330,7 @@ pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` right by the amount specified by the /// corresponding element in `count` while shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsravd))] pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -2339,7 +2339,7 @@ pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` right by the amount specified by the /// corresponding element in `count` while shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsravd))] pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { @@ -2347,7 +2347,7 @@ pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { } /// Shift 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))] pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i { @@ -2361,7 +2361,7 @@ pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i { } /// Shift 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))] pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i { @@ -2370,7 +2370,7 @@ pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 16-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlw))] pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { @@ -2379,7 +2379,7 @@ pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 32-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrld))] pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { @@ -2388,7 +2388,7 @@ pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 64-bit integers in `a` right by `count` while shifting in /// zeros. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlq))] pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { @@ -2397,7 +2397,7 @@ pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in /// zeros -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlw))] pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2406,7 +2406,7 @@ pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in /// zeros -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrld))] pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i { @@ -2415,7 +2415,7 @@ pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in /// zeros -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlq))] pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i { @@ -2424,7 +2424,7 @@ pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvd))] pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -2433,7 +2433,7 @@ pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvd))] pub unsafe fn _mm256_srlv_epi32(a: 
__m256i, count: __m256i) -> __m256i { @@ -2442,7 +2442,7 @@ pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { /// Shift packed 64-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvq))] pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { @@ -2451,7 +2451,7 @@ pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 64-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvq))] pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { @@ -2461,7 +2461,7 @@ pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { // TODO _mm256_stream_load_si256 (__m256i const* mem_addr) /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubw))] pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2469,7 +2469,7 @@ pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Subtract packed 32-bit integers in `b` from packed 16-bit integers in `a` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubd))] pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -2477,7 +2477,7 @@ pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Subtract packed 64-bit integers in `b` from packed 16-bit integers in `a` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubq))] pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -2485,7 +2485,7 @@ pub unsafe fn 
_mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { } /// Subtract packed 8-bit integers in `b` from packed 16-bit integers in `a` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubb))] pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2494,7 +2494,7 @@ pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in /// `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubsw))] pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2503,7 +2503,7 @@ pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in /// `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubsb))] pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2512,7 +2512,7 @@ pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { /// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit /// integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubusw))] pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -2521,7 +2521,7 @@ pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { /// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit /// integers in `a` using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubusb))] pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -2560,7 +2560,7 @@ pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhbw))] pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2605,7 +2605,7 @@ pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklbw))] pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2648,7 +2648,7 @@ pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhwd))] pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2689,7 +2689,7 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklwd))] pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2729,7 +2729,7 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhdq))] pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -2765,7 +2765,7 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckldq))] pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -2801,7 +2801,7 @@ pub 
unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhqdq))] pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -2837,7 +2837,7 @@ pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklqdq))] pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -2847,7 +2847,7 @@ pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { /// Compute the bitwise XOR of 256 bits (representing integer data) /// in `a` and `b` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { @@ -2858,7 +2858,7 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468][https://reviews.llvm.org/D20468]. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i8 { @@ -2870,7 +2870,7 @@ pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i8 { /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468][https://reviews.llvm.org/D20468]. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i16 { @@ -2879,7 +2879,7 @@ pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i16 { } /// Extract a 32-bit integer from `a`, selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. 
pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 { @@ -2888,7 +2888,7 @@ pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 { } /// Returns the first element of the input vector of [4 x double]. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] //#[cfg_attr(test, assert_instr(movsd))] FIXME pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 { @@ -2896,7 +2896,7 @@ pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 { } /// Returns the first element of the input vector of [8 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] //#[cfg_attr(test, assert_instr(movd))] FIXME pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { diff --git a/library/stdarch/coresimd/src/x86/i586/bmi.rs b/library/stdarch/coresimd/src/x86/i586/bmi.rs index 9b3eee2aa02e..512695e049cf 100644 --- a/library/stdarch/coresimd/src/x86/i586/bmi.rs +++ b/library/stdarch/coresimd/src/x86/i586/bmi.rs @@ -14,7 +14,7 @@ use stdsimd_test::assert_instr; /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(bextr))] pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { @@ -26,7 +26,7 @@ pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { /// /// Bits [7,0] of `control` specify the index to the first bit in the range to /// be extracted, and bits [15,8] specify the length of the range. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(bextr))] pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 { @@ -34,7 +34,7 @@ pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 { } /// Bitwise logical `AND` of inverted `a` with `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(andn))] pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 { @@ -42,7 +42,7 @@ pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 { } /// Extract lowest set isolated bit. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsi))] pub unsafe fn _blsi_u32(x: u32) -> u32 { @@ -50,7 +50,7 @@ pub unsafe fn _blsi_u32(x: u32) -> u32 { } /// Get mask up to lowest set bit. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsmsk))] pub unsafe fn _blsmsk_u32(x: u32) -> u32 { @@ -60,7 +60,7 @@ pub unsafe fn _blsmsk_u32(x: u32) -> u32 { /// Resets the lowest set bit of `x`. /// /// If `x` is sets CF. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsr))] pub unsafe fn _blsr_u32(x: u32) -> u32 { @@ -70,7 +70,7 @@ pub unsafe fn _blsr_u32(x: u32) -> u32 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(tzcnt))] pub unsafe fn _tzcnt_u32(x: u32) -> u32 { @@ -80,7 +80,7 @@ pub unsafe fn _tzcnt_u32(x: u32) -> u32 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits. 
-#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(tzcnt))] pub unsafe fn _mm_tzcnt_32(x: u32) -> i32 { diff --git a/library/stdarch/coresimd/src/x86/i586/bmi2.rs b/library/stdarch/coresimd/src/x86/i586/bmi2.rs index adc963e0f534..e4d393f99064 100644 --- a/library/stdarch/coresimd/src/x86/i586/bmi2.rs +++ b/library/stdarch/coresimd/src/x86/i586/bmi2.rs @@ -17,7 +17,7 @@ use stdsimd_test::assert_instr; /// /// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with /// the low half and the high half of the result. -#[inline(always)] +#[inline] // LLVM BUG (should be mulxl): https://bugs.llvm.org/show_bug.cgi?id=34232 #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))] #[cfg_attr(all(test, target_arch = "x86"), assert_instr(mulx))] @@ -29,7 +29,7 @@ pub unsafe fn _mulx_u32(a: u32, b: u32, hi: &mut u32) -> u32 { } /// Zero higher bits of `a` >= `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(bzhi))] pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 { @@ -38,7 +38,7 @@ pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 { /// Scatter contiguous low order bits of `a` to the result at the positions /// specified by the `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pdep))] pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 { @@ -47,7 +47,7 @@ pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 { /// Gathers the bits of `x` specified by the `mask` into the contiguous low /// order bit positions of the result. 
-#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pext))] pub unsafe fn _pext_u32(a: u32, mask: u32) -> u32 { diff --git a/library/stdarch/coresimd/src/x86/i586/bswap.rs b/library/stdarch/coresimd/src/x86/i586/bswap.rs index 8bac16756922..92f1634bf506 100644 --- a/library/stdarch/coresimd/src/x86/i586/bswap.rs +++ b/library/stdarch/coresimd/src/x86/i586/bswap.rs @@ -6,14 +6,14 @@ use stdsimd_test::assert_instr; /// Return an integer with the reversed byte order of x -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(bswap))] pub unsafe fn _bswap(x: i32) -> i32 { bswap_i32(x) } /// Return an integer with the reversed byte order of x -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(bswap))] pub unsafe fn _bswap64(x: i64) -> i64 { bswap_i64(x) diff --git a/library/stdarch/coresimd/src/x86/i586/cpuid.rs b/library/stdarch/coresimd/src/x86/i586/cpuid.rs index 2480eb58e065..eeb7ac3681ea 100644 --- a/library/stdarch/coresimd/src/x86/i586/cpuid.rs +++ b/library/stdarch/coresimd/src/x86/i586/cpuid.rs @@ -42,7 +42,7 @@ pub struct CpuidResult { /// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID /// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf /// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(cpuid))] pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult { let mut r = ::core::mem::uninitialized::(); @@ -62,14 +62,14 @@ pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult { } /// See [`__cpuid_count`](fn.__cpuid_count.html). -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(cpuid))] pub unsafe fn __cpuid(leaf: u32) -> CpuidResult { __cpuid_count(leaf, 0) } /// Does the host support the `cpuid` instruction? 
-#[inline(always)] +#[inline] pub fn has_cpuid() -> bool { #[cfg(target_arch = "x86_64")] { @@ -111,7 +111,7 @@ pub fn has_cpuid() -> bool { /// /// See also [`__cpuid`](fn.__cpuid.html) and /// [`__cpuid_count`](fn.__cpuid_count.html). -#[inline(always)] +#[inline] pub unsafe fn __get_cpuid_max(leaf: u32) -> (u32, u32) { let CpuidResult { eax, ebx, .. } = __cpuid(leaf); (eax, ebx) diff --git a/library/stdarch/coresimd/src/x86/i586/rdtsc.rs b/library/stdarch/coresimd/src/x86/i586/rdtsc.rs index f9929aaa6b0f..9649562cdcf8 100644 --- a/library/stdarch/coresimd/src/x86/i586/rdtsc.rs +++ b/library/stdarch/coresimd/src/x86/i586/rdtsc.rs @@ -15,7 +15,7 @@ use stdsimd_test::assert_instr; /// /// On processors that support the Intel 64 architecture, the /// high-order 32 bits of each of RAX and RDX are cleared. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rdtsc))] pub unsafe fn _rdtsc() -> u64 { rdtsc() @@ -35,7 +35,7 @@ pub unsafe fn _rdtsc() -> u64 { /// /// On processors that support the Intel 64 architecture, the /// high-order 32 bits of each of RAX, RDX, and RCX are cleared. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rdtscp))] pub unsafe fn _rdtscp(aux: *mut u32) -> u64 { rdtscp(aux as *mut _) diff --git a/library/stdarch/coresimd/src/x86/i586/sse.rs b/library/stdarch/coresimd/src/x86/i586/sse.rs index 8911429a486f..57b3f42a2406 100644 --- a/library/stdarch/coresimd/src/x86/i586/sse.rs +++ b/library/stdarch/coresimd/src/x86/i586/sse.rs @@ -13,7 +13,7 @@ use stdsimd_test::assert_instr; /// Adds the first component of `a` and `b`, the other components are copied /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(addss))] pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 { @@ -21,7 +21,7 @@ pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 { } /// Adds __m128 vectors. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(addps))] pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 { @@ -30,7 +30,7 @@ pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 { /// Subtracts the first component of `b` from `a`, the other components are /// copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(subss))] pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 { @@ -38,7 +38,7 @@ pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 { } /// Subtracts __m128 vectors. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(subps))] pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 { @@ -47,7 +47,7 @@ pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 { /// Multiplies the first component of `a` and `b`, the other components are /// copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(mulss))] pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 { @@ -55,7 +55,7 @@ pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 { } /// Multiplies __m128 vectors. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(mulps))] pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 { @@ -64,7 +64,7 @@ pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 { /// Divides the first component of `b` by `a`, the other components are /// copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(divss))] pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 { @@ -72,7 +72,7 @@ pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 { } /// Divides __m128 vectors. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(divps))] pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 { @@ -81,7 +81,7 @@ pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 { /// Return the square root of the first single-precision (32-bit) /// floating-point element in `a`, the other elements are unchanged. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(sqrtss))] pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 { @@ -90,7 +90,7 @@ pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 { /// Return the square root of packed single-precision (32-bit) floating-point /// elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(sqrtps))] pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 { @@ -99,7 +99,7 @@ pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 { /// Return the approximate reciprocal of the first single-precision /// (32-bit) floating-point element in `a`, the other elements are unchanged. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rcpss))] pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 { @@ -108,7 +108,7 @@ pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 { /// Return the approximate reciprocal of packed single-precision (32-bit) /// floating-point elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rcpps))] pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 { @@ -117,7 +117,7 @@ pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 { /// Return the approximate reciprocal square root of the fist single-precision /// (32-bit) floating-point elements in `a`, the other elements are unchanged. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rsqrtss))] pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 { @@ -126,7 +126,7 @@ pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 { /// Return the approximate reciprocal square root of packed single-precision /// (32-bit) floating-point elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rsqrtps))] pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 { @@ -136,7 +136,7 @@ pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 { /// Compare the first single-precision (32-bit) floating-point element of `a` /// and `b`, and return the minimum value in the first element of the return /// value, the other elements are copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(minss))] pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 { @@ -145,7 +145,7 @@ pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 { /// Compare packed single-precision (32-bit) floating-point elements in `a` and /// `b`, and return the corresponding minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(minps))] pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 { @@ -155,7 +155,7 @@ pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 { /// Compare the first single-precision (32-bit) floating-point element of `a` /// and `b`, and return the maximum value in the first element of the return /// value, the other elements are copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(maxss))] pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 { @@ -164,7 +164,7 @@ pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 { /// Compare packed single-precision (32-bit) floating-point elements in `a` and /// `b`, and return the corresponding maximum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(maxps))] pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 { @@ -172,7 +172,7 @@ pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 { } /// Bitwise AND of packed single-precision (32-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `and` instructions, so ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), @@ -187,7 +187,7 @@ pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 { /// elements. /// /// Computes `!a & b` for each bit in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `not` and `and` instructions, so ignore // it. @@ -201,7 +201,7 @@ pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 { } /// Bitwise OR of packed single-precision (32-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `or` instructions, so we ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), @@ -214,7 +214,7 @@ pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 { /// Bitwise exclusive OR of packed single-precision (32-bit) floating-point /// elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `xor` instructions, so we ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), @@ -228,7 +228,7 @@ pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 { /// Compare the lowest `f32` of both inputs for equality. The lowest 32 bits of /// the result will be `0xffffffff` if the two inputs are equal, or `0` /// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpeqss))] pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 { @@ -239,7 +239,7 @@ pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 { /// of the result will be `0xffffffff` if `a.extract(0)` is less than /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the /// upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltss))] pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 { @@ -250,7 +250,7 @@ pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 { /// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than /// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result /// are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpless))] pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 { @@ -261,7 +261,7 @@ pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 { /// bits of the result will be `0xffffffff` if `a.extract(0)` is greater /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result /// are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltss))] pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 { @@ -272,7 +272,7 @@ pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 { /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is /// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits /// of the result are the upper 96 bits of `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpless))] pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 { @@ -283,7 +283,7 @@ pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 { /// of the result will be `0xffffffff` if `a.extract(0)` is not equal to /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the /// upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpneqss))] pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 { @@ -294,7 +294,7 @@ pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 { /// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the /// upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltss))] pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 { @@ -305,7 +305,7 @@ pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 { /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not /// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits /// of the result are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnless))] pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 { @@ -316,7 +316,7 @@ pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 { /// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are /// the upper 96 bits of `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltss))] pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 { @@ -327,7 +327,7 @@ pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 { /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not /// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 /// bits of the result are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnless))] pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 { @@ -338,7 +338,7 @@ pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 { /// the result will be `0xffffffff` if neither of `a.extract(0)` or /// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result /// are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpordss))] pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 { @@ -349,7 +349,7 @@ pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 { /// of the result will be `0xffffffff` if any of `a.extract(0)` or /// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result /// are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpunordss))] pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 { @@ -359,7 +359,7 @@ pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input elements /// were equal, or `0` otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpeqps))] pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 { @@ -369,7 +369,7 @@ pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is less than the corresponding element in `b`, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltps))] pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 { @@ -380,7 +380,7 @@ pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is less than or equal to the corresponding element in `b`, or `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpleps))] pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 { @@ -390,7 +390,7 @@ pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is greater than the corresponding element in `b`, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltps))] pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 { @@ -401,7 +401,7 @@ pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is greater than or equal to the corresponding element in `b`, or `0` /// otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpleps))] pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 { @@ -411,7 +411,7 @@ pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input elements /// are *not* equal, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpneqps))] pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 { @@ -422,7 +422,7 @@ pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* less than the corresponding element in `b`, or `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltps))] pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 { @@ -433,7 +433,7 @@ pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* less than or equal to the corresponding element in `b`, or /// `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnleps))] pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 { @@ -444,7 +444,7 @@ pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* greater than the corresponding element in `b`, or `0` /// otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltps))] pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 { @@ -455,7 +455,7 @@ pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* greater than or equal to the corresponding element in `b`, /// or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnleps))] pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 { @@ -466,7 +466,7 @@ pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 { /// Returns four floats that have one of two possible bit patterns. The element /// in the output vector will be `0xffffffff` if the input elements in `a` and /// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpordps))] pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 { @@ -477,7 +477,7 @@ pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 { /// Returns four floats that have one of two possible bit patterns. The element /// in the output vector will be `0xffffffff` if the input elements in `a` and /// `b` are unordered (i.e., at least on of them is a NaN), or 0 otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpunordps))] pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 { @@ -486,7 +486,7 @@ pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if they are equal, or `0` otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 { @@ -495,7 +495,7 @@ pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 { @@ -505,7 +505,7 @@ pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is less than or equal to the one from `b`, or `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 { @@ -515,7 +515,7 @@ pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is greater than the one from `b`, or `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 { @@ -525,7 +525,7 @@ pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is greater than or equal to the one from `b`, or /// `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 { @@ -534,7 +534,7 @@ pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. 
Returns /// `1` if they are *not* equal, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 { @@ -544,7 +544,7 @@ pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if they are equal, or `0` otherwise. This instruction will not signal /// an exception if either argument is a quiet NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 { @@ -555,7 +555,7 @@ pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. /// This instruction will not signal an exception if either argument is a quiet /// NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 { @@ -566,7 +566,7 @@ pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is less than or equal to the one from `b`, or `0` /// otherwise. This instruction will not signal an exception if either argument /// is a quiet NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 { @@ -577,7 +577,7 @@ pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is greater than the one from `b`, or `0` /// otherwise. This instruction will not signal an exception if either argument /// is a quiet NaN. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 { @@ -588,7 +588,7 @@ pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is greater than or equal to the one from `b`, or /// `0` otherwise. This instruction will not signal an exception if either /// argument is a quiet NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 { @@ -598,7 +598,7 @@ pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if they are *not* equal, or `0` otherwise. This instruction will not /// signal an exception if either argument is a quiet NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { @@ -613,7 +613,7 @@ pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { /// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTSS2SI` instruction (with 32 bit output). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtss2si))] pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 { @@ -621,7 +621,7 @@ pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 { } /// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtss2si))] pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 { @@ -638,7 +638,7 @@ pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 { /// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output). 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvttss2si))] pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 { @@ -646,7 +646,7 @@ pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 { } /// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvttss2si))] pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 { @@ -654,7 +654,7 @@ pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 { } /// Extract the lowest 32 bit float from the input vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // No point in using assert_instrs. In Unix x86_64 calling convention this is a // no-op, and on Windows it's just a `mov`. @@ -667,7 +667,7 @@ pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 { /// /// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit /// input). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtsi2ss))] pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 { @@ -675,7 +675,7 @@ pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 { } /// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtsi2ss))] pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 { @@ -684,7 +684,7 @@ pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 { /// Construct a `__m128` with the lowest element set to `a` and the rest set to /// zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_set_ss(a: f32) -> __m128 { @@ -692,7 +692,7 @@ pub unsafe fn _mm_set_ss(a: f32) -> __m128 { } /// Construct a `__m128` with all element set to `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(shufps))] pub unsafe fn _mm_set1_ps(a: f32) -> __m128 { @@ -700,7 +700,7 @@ pub unsafe fn _mm_set1_ps(a: f32) -> __m128 { } /// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html) -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(shufps))] pub unsafe fn _mm_set_ps1(a: f32) -> __m128 { @@ -724,7 +724,7 @@ pub unsafe fn _mm_set_ps1(a: f32) -> __m128 { /// ```text /// let v = _mm_set_ps(d, c, b, a); /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(unpcklps))] pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { @@ -739,7 +739,7 @@ pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { /// ```text /// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d)); /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(unpcklps))] // On a 32-bit architecture it just copies the operands from the stack. @@ -749,7 +749,7 @@ pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { } /// Construct a `__m128` with all elements initialized to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(xorps))] pub unsafe fn _mm_setzero_ps() -> __m128 { @@ -761,7 +761,7 @@ pub unsafe fn _mm_setzero_ps() -> __m128 { /// /// The lower half of result takes values from `a` and the higher half from /// `b`. Mask is split to 2 control bits each to index the element from inputs. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(shufps, mask = 3))] pub unsafe fn _mm_shuffle_ps(a: __m128, b: __m128, mask: u32) -> __m128 { @@ -812,7 +812,7 @@ pub unsafe fn _mm_shuffle_ps(a: __m128, b: __m128, mask: u32) -> __m128 { /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the higher half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(unpckhps))] pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 { @@ -821,7 +821,7 @@ pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 { /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the lower half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(unpcklps))] pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 { @@ -830,7 +830,7 @@ pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 { /// Combine higher half of `a` and `b`. The highwe half of `b` occupies the /// lower half of result. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(all(test, not(windows)), assert_instr(movhlps))] #[cfg_attr(all(test, windows), assert_instr(unpckhpd))] @@ -841,7 +841,7 @@ pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 { /// Combine lower half of `a` and `b`. The lower half of `b` occupies the /// higher half of result. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(all(test, target_feature = "sse2"), assert_instr(unpcklpd))] #[cfg_attr(all(test, not(target_feature = "sse2")), assert_instr(movlhps))] @@ -853,7 +853,7 @@ pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 { /// /// The mask is stored in the 4 least significant bits of the return value. /// All other bits are set to `0`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movmskps))] pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { @@ -892,7 +892,7 @@ pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // TODO: generates MOVHPD if the CPU supports SSE2. // #[cfg_attr(test, assert_instr(movhps))] @@ -943,7 +943,7 @@ pub unsafe fn _mm_loadh_pi(a: __m128, p: *const __m64) -> __m128 { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // TODO: generates MOVLPD if the CPU supports SSE2. // #[cfg_attr(test, assert_instr(movlps))] @@ -966,7 +966,7 @@ pub unsafe fn _mm_loadl_pi(a: __m128, p: *const __m64) -> __m128 { /// elements set to zero. /// /// This corresponds to instructions `VMOVSS` / `MOVSS`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 { @@ -978,7 +978,7 @@ pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 { /// /// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some /// shuffling. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 { @@ -987,7 +987,7 @@ pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 { } /// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html) -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { @@ -1002,7 +1002,7 @@ pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { /// memory. /// /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { @@ -1016,7 +1016,7 @@ pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { /// may be faster. /// /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movups))] pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { @@ -1049,7 +1049,7 @@ pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { /// /// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some /// shuffling. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { @@ -1061,7 +1061,7 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { /// /// This intrinsic corresponds to the `MOVHPS` instruction. The compiler may /// choose to generate an equivalent sequence of other instructions. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // On i686 and up LLVM actually generates MOVHPD instead of MOVHPS, that's // fine. @@ -1091,7 +1091,7 @@ pub unsafe fn _mm_storeh_pi(p: *mut __m64, a: __m128) { /// /// This intrinsic corresponds to the `MOVQ` instruction. The compiler may /// choose to generate an equivalent sequence of other instructions. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // On i586 the codegen just generates plane MOVs. No need to test for that. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2"), @@ -1121,7 +1121,7 @@ pub unsafe fn _mm_storel_pi(p: *mut __m64, a: __m128) { /// Store the lowest 32 bit float of `a` into memory. /// /// This intrinsic corresponds to the `MOVSS` instruction. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { @@ -1144,7 +1144,7 @@ pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { /// *p.offset(2) = x; /// *p.offset(3) = x; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { @@ -1153,7 +1153,7 @@ pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { } /// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html) -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { @@ -1169,7 +1169,7 @@ pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { /// memory. /// /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { @@ -1181,7 +1181,7 @@ pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { /// faster. /// /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movups))] pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { @@ -1206,7 +1206,7 @@ pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { /// *p.offset(2) = a.extract(1); /// *p.offset(3) = a.extract(0); /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { @@ -1221,7 +1221,7 @@ pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { /// ```text /// _mm_move_ss(a, b) == a.replace(0, b.extract(0)) /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 { @@ -1234,7 +1234,7 @@ pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 { /// Guarantees that every store instruction that precedes, in program order, is /// globally visible before any store instruction which follows the fence in /// program order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(sfence))] pub unsafe fn _mm_sfence() { @@ -1244,7 +1244,7 @@ pub unsafe fn _mm_sfence() { /// Get the unsigned 32-bit value of the MXCSR control and status register. 
/// /// For more info see [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(stmxcsr))] pub unsafe fn _mm_getcsr() -> u32 { @@ -1378,7 +1378,7 @@ pub unsafe fn _mm_getcsr() -> u32 { /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on /// ``` /// -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ldmxcsr))] pub unsafe fn _mm_setcsr(val: u32) { @@ -1435,7 +1435,7 @@ pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000; pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { @@ -1443,7 +1443,7 @@ pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { @@ -1451,7 +1451,7 @@ pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { @@ -1459,7 +1459,7 @@ pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { @@ -1467,7 +1467,7 @@ pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { @@ -1475,7 +1475,7 @@ pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn 
_MM_SET_EXCEPTION_STATE(x: u32) { @@ -1483,7 +1483,7 @@ pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { @@ -1493,7 +1493,7 @@ pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) { @@ -1548,7 +1548,7 @@ pub const _MM_HINT_NTA: i8 = 0; /// * Prefetching may also fail if there are not enough memory-subsystem /// resources (e.g., request buffers). /// -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(prefetcht0, strategy = _MM_HINT_T0))] #[cfg_attr(test, assert_instr(prefetcht1, strategy = _MM_HINT_T1))] @@ -1573,7 +1573,7 @@ pub unsafe fn _mm_prefetch(p: *const u8, strategy: i8) { } /// Return vector of type __m128 with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] pub unsafe fn _mm_undefined_ps() -> __m128 { __m128( @@ -1585,7 +1585,7 @@ pub unsafe fn _mm_undefined_ps() -> __m128 { } /// Transpose the 4x4 matrix formed by 4 rows of __m128 in place. -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_TRANSPOSE4_PS( @@ -1684,7 +1684,7 @@ extern "C" { /// /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection /// exception _may_ be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movntps))] pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { @@ -1693,7 +1693,7 @@ pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { /// Store 64-bits of integer data from a into memory using a non-temporal /// memory hint. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse,mmx")] #[cfg_attr(test, assert_instr(movntq))] pub unsafe fn _mm_stream_pi(mem_addr: *mut __m64, a: __m64) { diff --git a/library/stdarch/coresimd/src/x86/i586/sse2.rs b/library/stdarch/coresimd/src/x86/i586/sse2.rs index c0555679bfe4..ab4a574e2d71 100644 --- a/library/stdarch/coresimd/src/x86/i586/sse2.rs +++ b/library/stdarch/coresimd/src/x86/i586/sse2.rs @@ -16,7 +16,7 @@ use x86::*; /// /// This can help improve the performance and power consumption of spin-wait /// loops. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pause))] pub unsafe fn _mm_pause() { @@ -25,7 +25,7 @@ pub unsafe fn _mm_pause() { /// Invalidate and flush the cache line that contains `p` from all levels of /// the cache hierarchy. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(clflush))] pub unsafe fn _mm_clflush(p: *mut u8) { @@ -38,7 +38,7 @@ pub unsafe fn _mm_clflush(p: *mut u8) { /// Guarantees that every load instruction that precedes, in program order, is /// globally visible before any load instruction which follows the fence in /// program order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(lfence))] pub unsafe fn _mm_lfence() { @@ -51,7 +51,7 @@ pub unsafe fn _mm_lfence() { /// Guarantees that every memory access that precedes, in program order, the /// memory fence instruction is globally visible before any memory instruction /// which follows the fence in program order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(mfence))] pub unsafe fn _mm_mfence() { @@ -59,7 +59,7 @@ pub unsafe fn _mm_mfence() { } /// Add packed 8-bit integers in `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddb))] pub unsafe fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -67,7 +67,7 @@ pub unsafe fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddw))] pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -75,7 +75,7 @@ pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddd))] pub unsafe fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -83,7 +83,7 @@ pub unsafe fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 64-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddq))] pub unsafe fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -91,7 +91,7 @@ pub unsafe fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddsb))] pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -99,7 +99,7 @@ pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddsw))] pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -107,7 +107,7 @@ pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddusb))] pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -115,7 +115,7 @@ pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddusw))] pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -123,7 +123,7 @@ pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { } /// Average packed unsigned 8-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pavgb))] pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -131,7 +131,7 @@ pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { } /// Average packed unsigned 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pavgw))] pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -143,7 +143,7 @@ pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { /// Multiply packed signed 16-bit integers in `a` and `b`, producing /// intermediate signed 32-bit integers. Horizontally add adjacent pairs of /// intermediate 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmaddwd))] pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -152,7 +152,7 @@ pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// maximum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmaxsw))] pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -161,7 +161,7 @@ pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 8-bit integers in `a` and `b`, and return the /// packed maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmaxub))] pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -170,7 +170,7 @@ pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pminsw))] pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -179,7 +179,7 @@ pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 8-bit integers in `a` and `b`, and return the /// packed minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pminub))] pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -190,7 +190,7 @@ pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { /// /// The multiplication produces intermediate 32-bit integers, and returns the /// high 16 bits of the intermediate integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmulhw))] pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -201,7 +201,7 @@ pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { /// /// The multiplication produces intermediate 32-bit integers, and returns the /// high 16 bits of the intermediate integers. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmulhuw))] pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -212,7 +212,7 @@ pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { /// /// The multiplication produces intermediate 32-bit integers, and returns the /// low 16 bits of the intermediate integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmullw))] pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -223,7 +223,7 @@ pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { /// in `a` and `b`. /// /// Return the unsigned 64-bit results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmuludq))] pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { @@ -236,7 +236,7 @@ pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { /// and `b`, then horizontally sum each consecutive 8 differences to produce /// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in /// the low 16 bits of 64-bit elements returned. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psadbw))] pub unsafe fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -244,7 +244,7 @@ pub unsafe fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubb))] pub unsafe fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -252,7 +252,7 @@ pub unsafe fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubw))] pub unsafe fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -260,7 +260,7 @@ pub unsafe fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubd))] pub unsafe fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -268,7 +268,7 @@ pub unsafe fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubq))] pub unsafe fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -277,7 +277,7 @@ pub unsafe fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubsb))] pub unsafe fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -286,7 +286,7 @@ pub unsafe fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubsw))] pub unsafe fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -295,7 +295,7 @@ pub unsafe fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit /// integers in `a` using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubusb))] pub unsafe fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -304,7 +304,7 @@ pub unsafe fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit /// integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubusw))] pub unsafe fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -312,7 +312,7 @@ pub unsafe fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { } /// Shift `a` left by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslldq, imm8 = 1))] pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { @@ -355,7 +355,7 @@ pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { } /// Shift `a` left by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslldq, imm8 = 1))] pub unsafe fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i { @@ -363,7 +363,7 @@ pub unsafe fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i { } /// Shift `a` right by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrldq, imm8 = 1))] pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i { @@ -371,7 +371,7 @@ pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i { } /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllw))] pub unsafe fn _mm_slli_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -380,7 +380,7 @@ pub unsafe fn _mm_slli_epi16(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 16-bit integers in `a` left by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllw))] pub unsafe fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { @@ -388,7 +388,7 @@ pub unsafe fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { } /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslld))] pub unsafe fn _mm_slli_epi32(a: __m128i, imm8: i32) -> __m128i { @@ -397,7 +397,7 @@ pub unsafe fn _mm_slli_epi32(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 32-bit integers in `a` left by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslld))] pub unsafe fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -405,7 +405,7 @@ pub unsafe fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { } /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllq))] pub unsafe fn _mm_slli_epi64(a: __m128i, imm8: i32) -> __m128i { @@ -414,7 +414,7 @@ pub unsafe fn _mm_slli_epi64(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 64-bit integers in `a` left by `count` while shifting in /// zeros. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllq))] pub unsafe fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { @@ -423,7 +423,7 @@ pub unsafe fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign /// bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psraw))] pub unsafe fn _mm_srai_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -432,7 +432,7 @@ pub unsafe fn _mm_srai_epi16(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign /// bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psraw))] pub unsafe fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { @@ -441,7 +441,7 @@ pub unsafe fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign /// bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrad))] pub unsafe fn _mm_srai_epi32(a: __m128i, imm8: i32) -> __m128i { @@ -450,7 +450,7 @@ pub unsafe fn _mm_srai_epi32(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign /// bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrad))] pub unsafe fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -458,7 +458,7 @@ pub unsafe fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { } /// Shift `a` right by `imm8` bytes while shifting in zeros. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrldq, imm8 = 1))] pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { @@ -502,7 +502,7 @@ pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlw))] pub unsafe fn _mm_srli_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -511,7 +511,7 @@ pub unsafe fn _mm_srli_epi16(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 16-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlw))] pub unsafe fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { @@ -520,7 +520,7 @@ pub unsafe fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrld))] pub unsafe fn _mm_srli_epi32(a: __m128i, imm8: i32) -> __m128i { @@ -529,7 +529,7 @@ pub unsafe fn _mm_srli_epi32(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 32-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrld))] pub unsafe fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -538,7 +538,7 @@ pub unsafe fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in /// zeros. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlq))] pub unsafe fn _mm_srli_epi64(a: __m128i, imm8: i32) -> __m128i { @@ -547,7 +547,7 @@ pub unsafe fn _mm_srli_epi64(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 64-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlq))] pub unsafe fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { @@ -556,7 +556,7 @@ pub unsafe fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andps))] pub unsafe fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { @@ -565,7 +565,7 @@ pub unsafe fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { /// Compute the bitwise NOT of 128 bits (representing integer data) in `a` and /// then AND with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andnps))] pub unsafe fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { @@ -574,7 +574,7 @@ pub unsafe fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(orps))] pub unsafe fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { @@ -583,7 +583,7 @@ pub unsafe fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and /// `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] pub unsafe fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { @@ -591,7 +591,7 @@ pub unsafe fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 8-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpeqb))] pub unsafe fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -599,7 +599,7 @@ pub unsafe fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 16-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpeqw))] pub unsafe fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -607,7 +607,7 @@ pub unsafe fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 32-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpeqd))] pub unsafe fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -615,7 +615,7 @@ pub unsafe fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 8-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtb))] pub unsafe fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -623,7 +623,7 @@ pub unsafe fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 16-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtw))] pub unsafe fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -631,7 +631,7 @@ pub unsafe fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 32-bit integers in `a` and `b` for greater-than. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtd))] pub unsafe fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -639,7 +639,7 @@ pub unsafe fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 8-bit integers in `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtb))] pub unsafe fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -647,7 +647,7 @@ pub unsafe fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 16-bit integers in `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtw))] pub unsafe fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -655,7 +655,7 @@ pub unsafe fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 32-bit integers in `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtd))] pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -664,7 +664,7 @@ pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { /// Convert the lower two packed 32-bit integers in `a` to packed /// double-precision (64-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtdq2pd))] pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { @@ -674,7 +674,7 @@ pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsi2sd))] pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { @@ -683,7 +683,7 @@ pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtdq2ps))] pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { @@ -692,7 +692,7 @@ pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtps2dq))] pub unsafe fn _mm_cvtps_epi32(a: __m128) -> __m128i { @@ -701,7 +701,7 @@ pub unsafe fn _mm_cvtps_epi32(a: __m128) -> __m128i { /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movd))] pub unsafe fn _mm_cvtsi32_si128(a: i32) -> __m128i { @@ -709,7 +709,7 @@ pub unsafe fn _mm_cvtsi32_si128(a: i32) -> __m128i { } /// Return the lowest element of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movd))] // FIXME mov on windows pub unsafe fn _mm_cvtsi128_si32(a: __m128i) -> i32 { @@ -718,7 +718,7 @@ pub unsafe fn _mm_cvtsi128_si32(a: __m128i) -> i32 { /// Set packed 64-bit integers with the supplied values, from highest to /// lowest. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { @@ -726,7 +726,7 @@ pub unsafe fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { } /// Set packed 32-bit integers with the supplied values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { @@ -734,7 +734,7 @@ pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { } /// Set packed 16-bit integers with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set_epi16( @@ -744,7 +744,7 @@ pub unsafe fn _mm_set_epi16( } /// Set packed 8-bit integers with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set_epi8( @@ -758,7 +758,7 @@ pub unsafe fn _mm_set_epi8( } /// Broadcast 64-bit integer `a` to all elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set1_epi64x(a: i64) -> __m128i { @@ -766,7 +766,7 @@ pub unsafe fn _mm_set1_epi64x(a: i64) -> __m128i { } /// Broadcast 32-bit integer `a` to all elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set1_epi32(a: i32) -> __m128i { @@ -774,7 +774,7 @@ pub unsafe fn _mm_set1_epi32(a: i32) -> __m128i { } /// Broadcast 16-bit integer `a` to all elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set1_epi16(a: i16) -> __m128i { @@ -782,7 +782,7 @@ pub unsafe fn _mm_set1_epi16(a: i16) -> __m128i { } /// Broadcast 8-bit integer `a` to all elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set1_epi8(a: i8) -> __m128i { @@ -790,7 +790,7 @@ pub unsafe fn _mm_set1_epi8(a: i8) -> __m128i { } /// Set packed 32-bit integers with the supplied values in reverse order. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { @@ -798,7 +798,7 @@ pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { } /// Set packed 16-bit integers with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_setr_epi16( @@ -808,7 +808,7 @@ pub unsafe fn _mm_setr_epi16( } /// Set packed 8-bit integers with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_setr_epi8( @@ -822,7 +822,7 @@ pub unsafe fn _mm_setr_epi8( } /// Returns a vector with all elements set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] pub unsafe fn _mm_setzero_si128() -> __m128i { @@ -830,7 +830,7 @@ pub unsafe fn _mm_setzero_si128() -> __m128i { } /// Load 64-bit integer from memory into first element of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // FIXME movsd on windows #[cfg_attr(all(test, not(windows), @@ -844,7 +844,7 @@ pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i { /// Load 128-bits of integer data from memory into a new vector. /// /// `mem_addr` must be aligned on a 16-byte boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i { @@ -854,7 +854,7 @@ pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i { /// Load 128-bits of integer data from memory into a new vector. /// /// `mem_addr` does not need to be aligned on any particular boundary. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i { @@ -875,7 +875,7 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i { /// /// `mem_addr` should correspond to a 128-bit memory location and does not need /// to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(maskmovdqu))] pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) { @@ -885,7 +885,7 @@ pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) /// Store 128-bits of integer data from `a` into memory. /// /// `mem_addr` must be aligned on a 16-byte boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) { @@ -895,7 +895,7 @@ pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) { /// Store 128-bits of integer data from `a` into memory. /// /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { @@ -905,7 +905,7 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { /// Store the lower 64-bit integer `a` to a memory location. /// /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // FIXME mov on windows, movlps on i686 #[cfg_attr(all(test, not(windows), @@ -923,7 +923,7 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { /// Stores a 128-bit integer vector to a 128-bit aligned memory location. 
/// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movntps))] // FIXME movntdq pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { @@ -933,7 +933,7 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { /// Stores a 32-bit integer value in the specified memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movnti))] pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { @@ -942,7 +942,7 @@ pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { /// Return a vector where the low element is extracted from `a` and its upper /// element is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // FIXME movd on windows, movd on i686 #[cfg_attr(all(test, not(windows), target_arch = "x86_64"), @@ -955,7 +955,7 @@ pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i { /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(packsswb))] pub unsafe fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -964,7 +964,7 @@ pub unsafe fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(packssdw))] pub unsafe fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -973,7 +973,7 @@ pub unsafe fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(packuswb))] pub unsafe fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -981,7 +981,7 @@ pub unsafe fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Return the `imm8` element of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pextrw, imm8 = 9))] pub unsafe fn _mm_extract_epi16(a: __m128i, imm8: i32) -> i32 { @@ -989,7 +989,7 @@ pub unsafe fn _mm_extract_epi16(a: __m128i, imm8: i32) -> i32 { } /// Return a new vector where the `imm8` element of `a` is replaced with `i`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pinsrw, imm8 = 9))] pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32, imm8: i32) -> __m128i { @@ -997,7 +997,7 @@ pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32, imm8: i32) -> __m128i { } /// Return a mask of the most significant bit of each element in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmovmskb))] pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 { @@ -1005,7 +1005,7 @@ pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 { } /// Shuffle 32-bit integers in `a` using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pshufd, imm8 = 9))] pub unsafe fn _mm_shuffle_epi32(a: __m128i, imm8: i32) -> __m128i { @@ -1068,7 +1068,7 @@ pub unsafe fn _mm_shuffle_epi32(a: __m128i, imm8: i32) -> __m128i { /// /// Put the results in the high 64 bits of the returned vector, with the low 64 /// bits being copied from `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pshufhw, imm8 = 9))] pub unsafe fn _mm_shufflehi_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -1126,7 +1126,7 @@ pub unsafe fn _mm_shufflehi_epi16(a: __m128i, imm8: i32) -> __m128i { /// /// Put the results in the low 64 bits of the returned vector, with the high 64 /// bits being copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pshuflw, imm8 = 9))] pub unsafe fn _mm_shufflelo_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -1179,7 +1179,7 @@ pub unsafe fn _mm_shufflelo_epi16(a: __m128i, imm8: i32) -> __m128i { } /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhbw))] pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -1191,7 +1191,7 @@ pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 16-bit integers from the high half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhwd))] pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -1200,7 +1200,7 @@ pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 32-bit integers from the high half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhdq))] pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -1208,7 +1208,7 @@ pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhqdq))] pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -1216,7 +1216,7 @@ pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpcklbw))] pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -1228,7 +1228,7 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpcklwd))] pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -1237,7 +1237,7 @@ pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckldq))] pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -1245,7 +1245,7 @@ pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpcklqdq))] pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -1254,7 +1254,7 @@ pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// Return a new vector with the low element of `a` replaced by the sum of the /// low elements of `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(addsd))] pub unsafe fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1263,7 +1263,7 @@ pub unsafe fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d { /// Add packed double-precision (64-bit) floating-point elements in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(addpd))] pub unsafe fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1272,7 +1272,7 @@ pub unsafe fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the result of /// dividing the lower element of `a` by the lower element of `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(divsd))] pub unsafe fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1281,7 +1281,7 @@ pub unsafe fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d { /// Divide packed double-precision (64-bit) floating-point elements in `a` by /// packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(divpd))] pub unsafe fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1290,7 +1290,7 @@ pub unsafe fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the maximum /// of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(maxsd))] pub unsafe fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1299,7 +1299,7 @@ pub unsafe fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the maximum values from corresponding elements in /// `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(maxpd))] pub unsafe fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1308,7 +1308,7 @@ pub unsafe fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the minimum /// of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(minsd))] pub unsafe fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1317,7 +1317,7 @@ pub unsafe fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the minimum values from corresponding elements in /// `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(minpd))] pub unsafe fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1326,7 +1326,7 @@ pub unsafe fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by multiplying the /// low elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(mulsd))] pub unsafe fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1335,7 +1335,7 @@ pub unsafe fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d { /// Multiply packed double-precision (64-bit) floating-point elements in `a` /// and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(mulpd))] pub unsafe fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1344,7 +1344,7 @@ pub unsafe fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the square /// root of the lower element `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(sqrtsd))] pub unsafe fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1352,7 +1352,7 @@ pub unsafe fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d { } /// Return a new vector with the square root of each of the values in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(sqrtpd))] pub unsafe fn _mm_sqrt_pd(a: __m128d) -> __m128d { @@ -1361,7 +1361,7 @@ pub unsafe fn _mm_sqrt_pd(a: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by subtracting the /// low element by `b` from the low element of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(subsd))] pub unsafe fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1370,7 +1370,7 @@ pub unsafe fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d { /// Subtract packed double-precision (64-bit) floating-point elements in `b` /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(subpd))] pub unsafe fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1379,7 +1379,7 @@ pub unsafe fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d { /// Compute the bitwise AND of packed double-precision (64-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andps))] pub unsafe fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1389,7 +1389,7 @@ pub unsafe fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compute the bitwise NOT of `a` and then AND with `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andnps))] pub unsafe fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1399,7 +1399,7 @@ pub unsafe fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compute the bitwise OR of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(orps))] pub unsafe fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1409,7 +1409,7 @@ pub unsafe fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compute the bitwise XOR of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] pub unsafe fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1420,7 +1420,7 @@ pub unsafe fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the equality /// comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpeqsd))] pub unsafe fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1429,7 +1429,7 @@ pub unsafe fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the less-than /// comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltsd))] pub unsafe fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1438,7 +1438,7 @@ pub unsafe fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// less-than-or-equal comparison of the lower elements of `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplesd))] pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1447,7 +1447,7 @@ pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// greater-than comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltsd))] pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1456,7 +1456,7 @@ pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// greater-than-or-equal comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplesd))] pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1467,7 +1467,7 @@ pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { /// of comparing both of the lower elements of `a` and `b` to `NaN`. If /// neither are equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpordsd))] pub unsafe fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1477,7 +1477,7 @@ pub unsafe fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the result of /// comparing both of the lower elements of `a` and `b` to `NaN`. If either is /// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpunordsd))] pub unsafe fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1486,7 +1486,7 @@ pub unsafe fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the not-equal /// comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpneqsd))] pub unsafe fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1495,7 +1495,7 @@ pub unsafe fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// not-less-than comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltsd))] pub unsafe fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1504,7 +1504,7 @@ pub unsafe fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// not-less-than-or-equal comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlesd))] pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1513,7 +1513,7 @@ pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// not-greater-than comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltsd))] pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1522,7 +1522,7 @@ pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// not-greater-than-or-equal comparison of the lower elements of `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlesd))] pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1530,7 +1530,7 @@ pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpeqpd))] pub unsafe fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1538,7 +1538,7 @@ pub unsafe fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltpd))] pub unsafe fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1546,7 +1546,7 @@ pub unsafe fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for less-than-or-equal -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplepd))] pub unsafe fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1554,7 +1554,7 @@ pub unsafe fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltpd))] pub unsafe fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1562,7 +1562,7 @@ pub unsafe fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for greater-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplepd))] pub unsafe fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1570,7 +1570,7 @@ pub unsafe fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` to see if neither is `NaN`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpordpd))] pub unsafe fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1578,7 +1578,7 @@ pub unsafe fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` to see if either is `NaN`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpunordpd))] pub unsafe fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1586,7 +1586,7 @@ pub unsafe fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for not-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpneqpd))] pub unsafe fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1594,7 +1594,7 @@ pub unsafe fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for not-less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltpd))] pub unsafe fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1602,7 +1602,7 @@ pub unsafe fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for not-less-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlepd))] pub unsafe fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1610,7 +1610,7 @@ pub unsafe fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for not-greater-than. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltpd))] pub unsafe fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1619,7 +1619,7 @@ pub unsafe fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d { /// Compare corresponding elements in `a` and `b` for /// not-greater-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlepd))] pub unsafe fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1627,7 +1627,7 @@ pub unsafe fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare the lower element of `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comieq_sd(a: __m128d, b: __m128d) -> bool { @@ -1635,7 +1635,7 @@ pub unsafe fn _mm_comieq_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comilt_sd(a: __m128d, b: __m128d) -> bool { @@ -1643,7 +1643,7 @@ pub unsafe fn _mm_comilt_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for less-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comile_sd(a: __m128d, b: __m128d) -> bool { @@ -1651,7 +1651,7 @@ pub unsafe fn _mm_comile_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comigt_sd(a: __m128d, b: __m128d) -> bool { @@ -1659,7 +1659,7 @@ pub unsafe fn _mm_comigt_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for greater-than-or-equal. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comige_sd(a: __m128d, b: __m128d) -> bool { @@ -1667,7 +1667,7 @@ pub unsafe fn _mm_comige_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for not-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comineq_sd(a: __m128d, b: __m128d) -> bool { @@ -1675,7 +1675,7 @@ pub unsafe fn _mm_comineq_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> bool { @@ -1683,7 +1683,7 @@ pub unsafe fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> bool { @@ -1691,7 +1691,7 @@ pub unsafe fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for less-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> bool { @@ -1699,7 +1699,7 @@ pub unsafe fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> bool { @@ -1707,7 +1707,7 @@ pub unsafe fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for greater-than-or-equal. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> bool { @@ -1715,7 +1715,7 @@ pub unsafe fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for not-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> bool { @@ -1724,7 +1724,7 @@ pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> bool { /// Convert packed double-precision (64-bit) floating-point elements in "a" to /// packed single-precision (32-bit) floating-point elements -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtpd2ps))] pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 { @@ -1734,7 +1734,7 @@ pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 { /// Convert packed single-precision (32-bit) floating-point elements in `a` to /// packed /// double-precision (64-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtps2pd))] pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d { @@ -1743,7 +1743,7 @@ pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d { /// Convert packed double-precision (64-bit) floating-point elements in `a` to /// packed 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtpd2dq))] pub unsafe fn _mm_cvtpd_epi32(a: __m128d) -> __m128i { @@ -1752,7 +1752,7 @@ pub unsafe fn _mm_cvtpd_epi32(a: __m128d) -> __m128i { /// Convert the lower double-precision (64-bit) floating-point element in a to /// a 32-bit integer. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2si))] pub unsafe fn _mm_cvtsd_si32(a: __m128d) -> i32 { @@ -1763,7 +1763,7 @@ pub unsafe fn _mm_cvtsd_si32(a: __m128d) -> i32 { /// to a single-precision (32-bit) floating-point element, store the result in /// the lower element of the return value, and copy the upper element from `a` /// to the upper element the return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2ss))] pub unsafe fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 { @@ -1771,7 +1771,7 @@ pub unsafe fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 { } /// Return the lower double-precision (64-bit) floating-point element of "a". -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, windows), assert_instr(movsd))] // FIXME movq/movlps/mov on other platform pub unsafe fn _mm_cvtsd_f64(a: __m128d) -> f64 { @@ -1782,7 +1782,7 @@ pub unsafe fn _mm_cvtsd_f64(a: __m128d) -> f64 { /// to a double-precision (64-bit) floating-point element, store the result in /// the lower element of the return value, and copy the upper element from `a` /// to the upper element the return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtss2sd))] pub unsafe fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d { @@ -1791,7 +1791,7 @@ pub unsafe fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d { /// Convert packed double-precision (64-bit) floating-point elements in `a` to /// packed 32-bit integers with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttpd2dq))] pub unsafe fn _mm_cvttpd_epi32(a: __m128d) -> __m128i { @@ -1800,7 +1800,7 @@ pub unsafe fn _mm_cvttpd_epi32(a: __m128d) -> __m128i { /// Convert the lower double-precision (64-bit) floating-point element in `a` /// to a 32-bit integer with truncation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttsd2si))] pub unsafe fn _mm_cvttsd_si32(a: __m128d) -> i32 { @@ -1809,7 +1809,7 @@ pub unsafe fn _mm_cvttsd_si32(a: __m128d) -> i32 { /// Convert packed single-precision (32-bit) floating-point elements in `a` to /// packed 32-bit integers with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttps2dq))] pub unsafe fn _mm_cvttps_epi32(a: __m128) -> __m128i { @@ -1818,7 +1818,7 @@ pub unsafe fn _mm_cvttps_epi32(a: __m128) -> __m128i { /// Copy double-precision (64-bit) floating-point element `a` to the lower /// element of the packed 64-bit return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_set_sd(a: f64) -> __m128d { _mm_set_pd(0.0, a) @@ -1826,7 +1826,7 @@ pub unsafe fn _mm_set_sd(a: f64) -> __m128d { /// Broadcast double-precision (64-bit) floating-point value a to all elements /// of the return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_set1_pd(a: f64) -> __m128d { _mm_set_pd(a, a) @@ -1834,7 +1834,7 @@ pub unsafe fn _mm_set1_pd(a: f64) -> __m128d { /// Broadcast double-precision (64-bit) floating-point value a to all elements /// of the return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_set_pd1(a: f64) -> __m128d { _mm_set_pd(a, a) @@ -1842,7 +1842,7 @@ pub unsafe fn _mm_set_pd1(a: f64) -> __m128d { /// Set packed double-precision (64-bit) floating-point elements in the return /// value with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_set_pd(a: f64, b: f64) -> __m128d { __m128d(b, a) @@ -1850,7 +1850,7 @@ pub unsafe fn _mm_set_pd(a: f64, b: f64) -> __m128d { /// Set packed double-precision (64-bit) floating-point elements in the return /// value with the supplied values in reverse order. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> __m128d { _mm_set_pd(b, a) @@ -1858,7 +1858,7 @@ pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> __m128d { /// Returns packed double-precision (64-bit) floating-point elements with all /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] // FIXME xorpd expected pub unsafe fn _mm_setzero_pd() -> __m128d { @@ -1869,7 +1869,7 @@ pub unsafe fn _mm_setzero_pd() -> __m128d { /// /// The mask is stored in the 2 least significant bits of the return value. /// All other bits are set to `0`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movmskpd))] pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 { @@ -1880,7 +1880,7 @@ pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 { /// floating-point elements) from memory into the returned vector. /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection /// exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d { @@ -1889,7 +1889,7 @@ pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d { /// Loads a 64-bit double-precision value to the low element of a /// 128-bit integer vector and clears the upper element. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movsd))] pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d { @@ -1899,7 +1899,7 @@ pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d { /// Loads a double-precision value into the high-order bits of a 128-bit /// vector of [2 x double]. The low-order bits are copied from the low-order /// bits of the first operand. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movhpd))] pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d { @@ -1909,7 +1909,7 @@ pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d { /// Loads a double-precision value into the low-order bits of a 128-bit /// vector of [2 x double]. The high-order bits are copied from the /// high-order bits of the first operand. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movlpd))] pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d { @@ -1920,7 +1920,7 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d { /// aligned memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movntps))] // FIXME movntpd pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { @@ -1929,7 +1929,7 @@ pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a /// memory location. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movlps))] // FIXME movsd only on windows pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) { @@ -1939,7 +1939,7 @@ pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) { /// Store 128-bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from `a` into memory. `mem_addr` must be aligned /// on a 16-byte boundary or a general-protection exception may be generated. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) { @@ -1949,7 +1949,7 @@ pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) { /// Store 128-bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) { @@ -1959,7 +1959,7 @@ pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) { /// Store the lower double-precision (64-bit) floating-point element from `a` /// into 2 contiguous elements in memory. `mem_addr` must be aligned on a /// 16-byte boundary or a general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) { let b: __m128d = simd_shuffle2(a, a, [0, 0]); @@ -1969,7 +1969,7 @@ pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) { /// Store the lower double-precision (64-bit) floating-point element from `a` /// into 2 contiguous elements in memory. `mem_addr` must be aligned on a /// 16-byte boundary or a general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) { let b: __m128d = simd_shuffle2(a, a, [0, 0]); @@ -1980,7 +1980,7 @@ pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) { /// memory in reverse order. /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection /// exception may be generated. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) { let b: __m128d = simd_shuffle2(a, a, [1, 0]); @@ -1989,7 +1989,7 @@ pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) { /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a /// memory location. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movhpd))] pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) { @@ -1998,7 +1998,7 @@ pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) { /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a /// memory location. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movlps))] // FIXME movlpd (movsd on windows) pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) { @@ -2007,7 +2007,7 @@ pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) { /// Load a double-precision (64-bit) floating-point element from memory /// into both elements of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d { @@ -2017,7 +2017,7 @@ pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d { /// Load a double-precision (64-bit) floating-point element from memory /// into both elements of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d { @@ -2027,7 +2027,7 @@ pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d { /// Load 2 double-precision (64-bit) floating-point elements from memory into /// the returned vector in reverse order. 
`mem_addr` must be aligned on a /// 16-byte boundary or a general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movapd))] pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d { @@ -2038,7 +2038,7 @@ pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d { /// Load 128-bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from memory into the returned vector. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d { @@ -2054,7 +2054,7 @@ pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d { /// Constructs a 128-bit floating-point vector of [2 x double] from two /// 128-bit vector parameters of [2 x double], using the immediate-value /// parameter as a specifier. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(shufpd, imm8 = 1))] pub unsafe fn _mm_shuffle_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { @@ -2069,7 +2069,7 @@ pub unsafe fn _mm_shuffle_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// Constructs a 128-bit floating-point vector of [2 x double]. The lower /// 64 bits are set to the lower 64 bits of the second parameter. The upper /// 64 bits are set to the upper 64 bits of the first parameter. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movsd))] pub unsafe fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d { @@ -2078,7 +2078,7 @@ pub unsafe fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d { /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit /// floating-point vector of [4 x float]. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castpd_ps(a: __m128d) -> __m128 { mem::transmute(a) @@ -2086,7 +2086,7 @@ pub unsafe fn _mm_castpd_ps(a: __m128d) -> __m128 { /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit /// integer vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castpd_si128(a: __m128d) -> __m128i { mem::transmute::(simd_cast(a)) @@ -2094,7 +2094,7 @@ pub unsafe fn _mm_castpd_si128(a: __m128d) -> __m128i { /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit /// floating-point vector of [2 x double]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castps_pd(a: __m128) -> __m128d { mem::transmute(a) @@ -2102,7 +2102,7 @@ pub unsafe fn _mm_castps_pd(a: __m128) -> __m128d { /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit /// integer vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castps_si128(a: __m128) -> __m128i { mem::transmute(a) @@ -2110,7 +2110,7 @@ pub unsafe fn _mm_castps_si128(a: __m128) -> __m128i { /// Casts a 128-bit integer vector into a 128-bit floating-point vector /// of [2 x double]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castsi128_pd(a: __m128i) -> __m128d { simd_cast(a.as_i64x2()) @@ -2118,21 +2118,21 @@ pub unsafe fn _mm_castsi128_pd(a: __m128i) -> __m128d { /// Casts a 128-bit integer vector into a 128-bit floating-point vector /// of [4 x float]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castsi128_ps(a: __m128i) -> __m128 { mem::transmute(a) } /// Return vector of type __m128d with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_undefined_pd() -> __m128d { _mm_set1_pd(mem::uninitialized()) } /// Return vector of type __m128i with undefined elements. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_undefined_si128() -> __m128i { _mm_set1_epi8(mem::uninitialized()) @@ -2143,7 +2143,7 @@ pub unsafe fn _mm_undefined_si128() -> __m128i { /// /// * The [127:64] bits are copied from the [127:64] bits of the second input /// * The [63:0] bits are copied from the [127:64] bits of the first input -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(unpckhpd))] pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d { @@ -2155,7 +2155,7 @@ pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d { /// /// * The [127:64] bits are copied from the [63:0] bits of the second input /// * The [63:0] bits are copied from the [63:0] bits of the first input -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(unpcklpd))] pub unsafe fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d { diff --git a/library/stdarch/coresimd/src/x86/i586/sse3.rs b/library/stdarch/coresimd/src/x86/i586/sse3.rs index cf26319612c0..f74341dffef2 100644 --- a/library/stdarch/coresimd/src/x86/i586/sse3.rs +++ b/library/stdarch/coresimd/src/x86/i586/sse3.rs @@ -9,7 +9,7 @@ use stdsimd_test::assert_instr; /// Alternatively add and subtract packed single-precision (32-bit) /// floating-point elements in `a` to/from packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(addsubps))] pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { @@ -18,7 +18,7 @@ pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { /// Alternatively add and subtract packed double-precision (64-bit) /// floating-point elements in `a` to/from packed elements in `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(addsubpd))] pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d { @@ -27,7 +27,7 @@ pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d { /// Horizontally add adjacent pairs of double-precision (64-bit) /// floating-point elements in `a` and `b`, and pack the results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(haddpd))] pub unsafe fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d { @@ -36,7 +36,7 @@ pub unsafe fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d { /// Horizontally add adjacent pairs of single-precision (32-bit) /// floating-point elements in `a` and `b`, and pack the results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(haddps))] pub unsafe fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 { @@ -45,7 +45,7 @@ pub unsafe fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 { /// Horizontally subtract adjacent pairs of double-precision (64-bit) /// floating-point elements in `a` and `b`, and pack the results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(hsubpd))] pub unsafe fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d { @@ -54,7 +54,7 @@ pub unsafe fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d { /// Horizontally add adjacent pairs of single-precision (32-bit) /// floating-point elements in `a` and `b`, and pack the results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(hsubps))] pub unsafe fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 { @@ -64,7 +64,7 @@ pub unsafe fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 { /// Load 128-bits of integer data from unaligned memory. /// This intrinsic may perform better than `_mm_loadu_si128` /// when the data crosses a cache line boundary. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(lddqu))] pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i { @@ -73,7 +73,7 @@ pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i { /// Duplicate the low double-precision (64-bit) floating-point element /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movddup))] pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d { @@ -82,7 +82,7 @@ pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d { /// Load a double-precision (64-bit) floating-point element from memory /// into both elements of return vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movddup))] pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d { @@ -91,7 +91,7 @@ pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d { /// Duplicate odd-indexed single-precision (32-bit) floating-point elements /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movshdup))] pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 { @@ -100,7 +100,7 @@ pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 { /// Duplicate even-indexed single-precision (32-bit) floating-point elements /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movsldup))] pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 { diff --git a/library/stdarch/coresimd/src/x86/i586/sse41.rs b/library/stdarch/coresimd/src/x86/i586/sse41.rs index bd63ed312202..4ca8397bd82d 100644 --- a/library/stdarch/coresimd/src/x86/i586/sse41.rs +++ b/library/stdarch/coresimd/src/x86/i586/sse41.rs @@ -47,7 +47,7 @@ pub const _MM_FROUND_NEARBYINT: i32 = /// The high bit of each corresponding mask byte determines the selection. /// If the high bit is set the element of `a` is selected. 
The element /// of `b` is selected otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pblendvb))] pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i { @@ -59,7 +59,7 @@ pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i /// The mask bits determine the selection. A clear bit selects the /// corresponding element of `a`, and a set bit the corresponding /// element of `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))] pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { @@ -73,7 +73,7 @@ pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// Blend packed double-precision (64-bit) floating-point elements from `a` /// and `b` using `mask` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendvpd))] pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { @@ -82,7 +82,7 @@ pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { /// Blend packed single-precision (32-bit) floating-point elements from `a` /// and `b` using `mask` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendvps))] pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { @@ -91,7 +91,7 @@ pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { /// Blend packed double-precision (64-bit) floating-point elements from `a` /// and `b` using control mask `imm2` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))] pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d { @@ -103,7 +103,7 @@ pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d { /// Blend packed single-precision (32-bit) 
floating-point elements from `a` /// and `b` using mask `imm4` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))] pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 { @@ -115,7 +115,7 @@ pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 { /// Extract a single-precision (32-bit) floating-point element from `a`, /// selected with `imm8` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] // TODO: Add test for Windows #[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8 = 0))] @@ -127,7 +127,7 @@ pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 { /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468][https://reviews.llvm.org/D20468]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pextrb, imm8 = 0))] pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 { @@ -136,7 +136,7 @@ pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 { } /// Extract an 32-bit integer from `a` selected with `imm8` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] // TODO: Add test for Windows #[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8 = 1))] @@ -167,7 +167,7 @@ pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 { /// /// * Bits `[3:0]`: If any of these bits are set, the corresponding result /// element is cleared. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))] pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { @@ -179,7 +179,7 @@ pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Return a copy of `a` with the 8-bit integer from `i` inserted at a /// location specified by `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))] pub unsafe fn _mm_insert_epi8(a: __m128i, i: i8, imm8: i32) -> __m128i { @@ -188,7 +188,7 @@ pub unsafe fn _mm_insert_epi8(a: __m128i, i: i8, imm8: i32) -> __m128i { /// Return a copy of `a` with the 32-bit integer from `i` inserted at a /// location specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))] pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i { @@ -197,7 +197,7 @@ pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i { /// Compare packed 8-bit integers in `a` and `b` and return packed maximum /// values in dst. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxsb))] pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -206,7 +206,7 @@ pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed /// maximum. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxuw))] pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -215,7 +215,7 @@ pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 32-bit integers in `a` and `b`, and return packed maximum /// values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxsd))] pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -224,7 +224,7 @@ pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed /// maximum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxud))] pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i { @@ -233,7 +233,7 @@ pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 8-bit integers in `a` and `b` and return packed minimum /// values in dst. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminsb))] pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -242,7 +242,7 @@ pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed /// minimum. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminuw))] pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -251,7 +251,7 @@ pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 32-bit integers in `a` and `b`, and return packed minimum /// values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminsd))] pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -260,7 +260,7 @@ pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed /// minimum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminud))] pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { @@ -269,7 +269,7 @@ pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using unsigned saturation -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(packusdw))] pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -277,7 +277,7 @@ pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 64-bit integers in `a` and `b` for equality -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pcmpeqq))] pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -285,7 +285,7 @@ pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i { } /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxbw))] pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i { @@ -295,7 +295,7 @@ pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i { } /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxbd))] pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i { @@ -306,7 +306,7 @@ pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i { /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed /// 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxbq))] pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i { @@ -316,7 +316,7 @@ pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i { } /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxwd))] pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i { @@ -326,7 +326,7 @@ pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i { } /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxwq))] pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i { @@ -336,7 +336,7 @@ pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i { } /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxdq))] pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i { @@ -346,7 +346,7 @@ pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i { } /// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxbw))] pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i { @@ -356,7 +356,7 @@ pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i { } /// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxbd))] pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i { @@ -366,7 +366,7 @@ pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i { } /// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxbq))] pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i { @@ -377,7 +377,7 @@ pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i { /// Zero extend packed unsigned 16-bit integers in `a` /// to packed 32-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = 
"sse4.1")] #[cfg_attr(test, assert_instr(pmovzxwd))] pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i { @@ -388,7 +388,7 @@ pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i { /// Zero extend packed unsigned 16-bit integers in `a` /// to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxwq))] pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i { @@ -399,7 +399,7 @@ pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i { /// Zero extend packed unsigned 32-bit integers in `a` /// to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxdq))] pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i { @@ -415,7 +415,7 @@ pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i { /// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of /// the dot product will be stored in the return value component. Otherwise if /// the broadcast mask bit is zero then the return component will be zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(dppd, imm8 = 0))] pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { @@ -432,7 +432,7 @@ pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of /// the dot product will be stored in the return value component. Otherwise if /// the broadcast mask bit is zero then the return component will be zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(dpps, imm8 = 0))] pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { @@ -445,7 +445,7 @@ pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Round the packed double-precision (64-bit) floating-point elements in `a` /// down to an integer value, and store the results as packed double-precision /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundpd))] pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d { @@ -455,7 +455,7 @@ pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d { /// Round the packed single-precision (32-bit) floating-point elements in `a` /// down to an integer value, and store the results as packed single-precision /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundps))] pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 { @@ -467,7 +467,7 @@ pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 { /// floating-point element in the lower element of the intrinsic result, /// and copy the upper element from `a` to the upper element of the intrinsic /// result. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundsd))] pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d { @@ -479,7 +479,7 @@ pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d { /// floating-point element in the lower element of the intrinsic result, /// and copy the upper 3 packed elements from `a` to the upper elements /// of the intrinsic result. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundss))] pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 { @@ -489,7 +489,7 @@ pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 { /// Round the packed double-precision (64-bit) floating-point elements in `a` /// up to an integer value, and store the results as packed double-precision /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundpd))] pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d { @@ -499,7 +499,7 @@ pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d { /// Round the packed single-precision (32-bit) floating-point elements in `a` /// up to an integer value, and store the results as packed single-precision /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundps))] pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 { @@ -511,7 +511,7 @@ pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 { /// floating-point element in the lower element of the intrisic result, /// and copy the upper element from `a` to the upper element /// of the intrinsic result. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundsd))] pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d { @@ -523,7 +523,7 @@ pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d { /// floating-point element in the lower element of the intrinsic result, /// and copy the upper 3 packed elements from `a` to the upper elements /// of the intrinsic result. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundss))] pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 { @@ -549,7 +549,7 @@ pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 { /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundpd, rounding = 0))] pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d { @@ -578,7 +578,7 @@ pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d { /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundps, rounding = 0))] pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 { @@ -609,7 +609,7 @@ pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 { /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundsd, rounding = 0))] pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { @@ -640,7 +640,7 @@ pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundss, rounding = 0))] pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { @@ -669,7 +669,7 @@ pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// * bits `[15:0]` - contain the minimum value found in parameter `a`, /// * bits `[18:16]` - contain the index of the minimum value /// * remaining bits are set to `0`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(phminposuw))] pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i { @@ -678,7 +678,7 @@ pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i { /// Multiply the low 32-bit integers from each packed 64-bit /// element in `a` and `b`, and return the signed 64-bit result. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmuldq))] pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -691,7 +691,7 @@ pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { /// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping /// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would return a /// negative number. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmulld))] pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -729,7 +729,7 @@ pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i { /// /// * A `__m128i` vector containing the sums of the sets of /// absolute differences between both operands. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))] pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i { diff --git a/library/stdarch/coresimd/src/x86/i586/sse42.rs b/library/stdarch/coresimd/src/x86/i586/sse42.rs index f358426d3113..f850306d2943 100644 --- a/library/stdarch/coresimd/src/x86/i586/sse42.rs +++ b/library/stdarch/coresimd/src/x86/i586/sse42.rs @@ -48,7 +48,7 @@ pub const _SIDD_UNIT_MASK: i32 = 0b0100_0000; /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return the generated mask. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistrm, imm8 = 0))] pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i32) -> __m128i { @@ -258,7 +258,7 @@ pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html /// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html /// [`_mm_cmpestri`]: fn._mm_cmpestri.html -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -273,7 +273,7 @@ pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return `1` if any character in `b` was null. /// and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -288,7 +288,7 @@ pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return `1` if the resulting mask was non-zero, /// and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -303,7 +303,7 @@ pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and returns `1` if any character in `a` was null, /// and `0` otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -317,7 +317,7 @@ pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return bit `0` of the resulting bit mask. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -332,7 +332,7 @@ pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return `1` if `b` did not contain a null /// character and the resulting mask was zero, and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -346,7 +346,7 @@ pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return the generated mask. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestrm, imm8 = 0))] pub unsafe fn _mm_cmpestrm( @@ -439,7 +439,7 @@ pub unsafe fn _mm_cmpestrm( /// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html /// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html /// [`_mm_cmpistri`]: fn._mm_cmpistri.html -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestri( @@ -456,7 +456,7 @@ pub unsafe fn _mm_cmpestri( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return `1` if any character in /// `b` was null, and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestrz( @@ -473,7 +473,7 @@ pub unsafe fn _mm_cmpestrz( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return `1` if the resulting mask /// was non-zero, and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestrc( @@ -490,7 +490,7 @@ pub unsafe fn _mm_cmpestrc( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return `1` if any character in /// a was null, and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestrs( @@ -507,7 +507,7 @@ pub unsafe fn _mm_cmpestrs( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return bit `0` of the resulting /// bit mask. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestro( @@ -525,7 +525,7 @@ pub unsafe fn _mm_cmpestro( /// using the control in `imm8`, and return `1` if `b` did not /// contain a null character and the resulting mask was zero, and `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestra( @@ -541,7 +541,7 @@ pub unsafe fn _mm_cmpestra( /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 8-bit integer `v`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] pub unsafe fn _mm_crc32_u8(crc: u32, v: u8) -> u32 { @@ -550,7 +550,7 @@ pub unsafe fn _mm_crc32_u8(crc: u32, v: u8) -> u32 { /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 16-bit integer `v`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] pub unsafe fn _mm_crc32_u16(crc: u32, v: u16) -> u32 { @@ -559,7 +559,7 @@ pub unsafe fn _mm_crc32_u16(crc: u32, v: u16) -> u32 { /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 32-bit integer `v`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] pub unsafe fn _mm_crc32_u32(crc: u32, v: u32) -> u32 { diff --git a/library/stdarch/coresimd/src/x86/i586/ssse3.rs b/library/stdarch/coresimd/src/x86/i586/ssse3.rs index f0498ef53176..01e461bd79ed 100644 --- a/library/stdarch/coresimd/src/x86/i586/ssse3.rs +++ b/library/stdarch/coresimd/src/x86/i586/ssse3.rs @@ -11,7 +11,7 @@ use x86::*; /// Compute the absolute value of packed 8-bit signed integers in `a` and /// return the unsigned results. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pabsb))] pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i { @@ -21,7 +21,7 @@ pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i { /// Compute the absolute value of each of the packed 16-bit signed integers in /// `a` and /// return the 16-bit unsigned integer -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pabsw))] pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i { @@ -31,7 +31,7 @@ pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i { /// Compute the absolute value of each of the packed 32-bit signed integers in /// `a` and /// return the 32-bit unsigned integer -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pabsd))] pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i { @@ -62,7 +62,7 @@ pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i { /// r /// } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pshufb))] pub unsafe fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -71,7 +71,7 @@ pub unsafe fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i { /// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, /// shift the result right by `n` bytes, and return the low 16 bytes. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(palignr, n = 15))] pub unsafe fn _mm_alignr_epi8(a: __m128i, b: __m128i, n: i32) -> __m128i { @@ -129,7 +129,7 @@ pub unsafe fn _mm_alignr_epi8(a: __m128i, b: __m128i, n: i32) -> __m128i { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phaddw))] pub unsafe fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -139,7 +139,7 @@ pub unsafe fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are /// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phaddsw))] pub unsafe fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -148,7 +148,7 @@ pub unsafe fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [4 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phaddd))] pub unsafe fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -157,7 +157,7 @@ pub unsafe fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { /// Horizontally subtract the adjacent pairs of values contained in 2 /// packed 128-bit vectors of [8 x i16]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phsubw))] pub unsafe fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -168,7 +168,7 @@ pub unsafe fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { /// packed 128-bit vectors of [8 x i16]. Positive differences greater than /// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are /// saturated to 8000h. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phsubsw))] pub unsafe fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -177,7 +177,7 @@ pub unsafe fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { /// Horizontally subtract the adjacent pairs of values contained in 2 /// packed 128-bit vectors of [4 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phsubd))] pub unsafe fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -189,7 +189,7 @@ pub unsafe fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { /// integer values contained in the second source operand, add pairs of /// contiguous products with signed saturation, and writes the 16-bit sums to /// the corresponding bits in the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pmaddubsw))] pub unsafe fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -199,7 +199,7 @@ pub unsafe fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { /// Multiply packed 16-bit signed integer values, truncate the 32-bit /// product to the 18 most significant bits by right-shifting, round the /// truncated value by adding 1, and write bits [16:1] to the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pmulhrsw))] pub unsafe fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -210,7 +210,7 @@ pub unsafe fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { /// integer in `b` is negative, and return the result. /// Elements in result are zeroed out when the corresponding element in `b` /// is zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(psignb))] pub unsafe fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -221,7 +221,7 @@ pub unsafe fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { /// integer in `b` is negative, and return the results. /// Elements in result are zeroed out when the corresponding element in `b` /// is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(psignw))] pub unsafe fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -232,7 +232,7 @@ pub unsafe fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { /// integer in `b` is negative, and return the results. /// Element in result are zeroed out when the corresponding element in `b` /// is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(psignd))] pub unsafe fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i { diff --git a/library/stdarch/coresimd/src/x86/i586/tbm.rs b/library/stdarch/coresimd/src/x86/i586/tbm.rs index 30019673a23b..1a9b48ca29fa 100644 --- a/library/stdarch/coresimd/src/x86/i586/tbm.rs +++ b/library/stdarch/coresimd/src/x86/i586/tbm.rs @@ -27,7 +27,7 @@ extern "C" { /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { _bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32)) @@ -35,7 +35,7 @@ pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. 
-#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 { _bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64)) @@ -46,7 +46,7 @@ pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 { /// /// Bits [7,0] of `control` specify the index to the first bit in the range to /// be extracted, and bits [15,8] specify the length of the range. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] pub fn _bextr2_u32(a: u32, control: u32) -> u32 { unsafe { x86_tbm_bextri_u32(a, control) } @@ -57,7 +57,7 @@ pub fn _bextr2_u32(a: u32, control: u32) -> u32 { /// /// Bits [7,0] of `control` specify the index to the first bit in the range to /// be extracted, and bits [15,8] specify the length of the range. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] pub fn _bextr2_u64(a: u64, control: u64) -> u64 { unsafe { x86_tbm_bextri_u64(a, control) } @@ -67,7 +67,7 @@ pub fn _bextr2_u64(a: u64, control: u64) -> u64 { /// Clears all bits below the least significant zero bit of `x`. /// /// If there is no zero bit in `x`, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcfill))] pub unsafe fn _blcfill_u32(x: u32) -> u32 { @@ -77,7 +77,7 @@ pub unsafe fn _blcfill_u32(x: u32) -> u32 { /// Clears all bits below the least significant zero bit of `x`. /// /// If there is no zero bit in `x`, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcfill))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -88,7 +88,7 @@ pub unsafe fn _blcfill_u64(x: u64) -> u64 { /// Sets all bits of `x` to 1 except for the least significant zero bit. /// /// If there is no zero bit in `x`, it sets all bits. 
-#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blci))] pub unsafe fn _blci_u32(x: u32) -> u32 { @@ -98,7 +98,7 @@ pub unsafe fn _blci_u32(x: u32) -> u32 { /// Sets all bits of `x` to 1 except for the least significant zero bit. /// /// If there is no zero bit in `x`, it sets all bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blci))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -109,7 +109,7 @@ pub unsafe fn _blci_u64(x: u64) -> u64 { /// Sets the least significant zero bit of `x` and clears all other bits. /// /// If there is no zero bit in `x`, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcic))] pub unsafe fn _blcic_u32(x: u32) -> u32 { @@ -119,7 +119,7 @@ pub unsafe fn _blcic_u32(x: u32) -> u32 { /// Sets the least significant zero bit of `x` and clears all other bits. /// /// If there is no zero bit in `x`, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcic))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -131,7 +131,7 @@ pub unsafe fn _blcic_u64(x: u64) -> u64 { /// that bit. /// /// If there is no zero bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcmsk))] pub unsafe fn _blcmsk_u32(x: u32) -> u32 { @@ -142,7 +142,7 @@ pub unsafe fn _blcmsk_u32(x: u32) -> u32 { /// that bit. /// /// If there is no zero bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -153,7 +153,7 @@ pub unsafe fn _blcmsk_u64(x: u64) -> u64 { /// Sets the least significant zero bit of `x`. /// /// If there is no zero bit in `x`, it returns `x`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcs))] pub unsafe fn _blcs_u32(x: u32) -> u32 { @@ -163,7 +163,7 @@ pub unsafe fn _blcs_u32(x: u32) -> u32 { /// Sets the least significant zero bit of `x`. /// /// If there is no zero bit in `x`, it returns `x`. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcs))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -174,7 +174,7 @@ pub unsafe fn _blcs_u64(x: u64) -> u64 { /// Sets all bits of `x` below the least significant one. /// /// If there is no set bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsfill))] pub unsafe fn _blsfill_u32(x: u32) -> u32 { @@ -184,7 +184,7 @@ pub unsafe fn _blsfill_u32(x: u32) -> u32 { /// Sets all bits of `x` below the least significant one. /// /// If there is no set bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsfill))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -195,7 +195,7 @@ pub unsafe fn _blsfill_u64(x: u64) -> u64 { /// Clears least significant bit and sets all other bits. /// /// If there is no set bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsic))] pub unsafe fn _blsic_u32(x: u32) -> u32 { @@ -205,7 +205,7 @@ pub unsafe fn _blsic_u32(x: u32) -> u32 { /// Clears least significant bit and sets all other bits. /// /// If there is no set bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsic))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -217,7 +217,7 @@ pub unsafe fn _blsic_u64(x: u64) -> u64 { /// bits. /// /// If the least significant bit of `x` is 0, it sets all bits. 
-#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(t1mskc))] pub unsafe fn _t1mskc_u32(x: u32) -> u32 { @@ -228,7 +228,7 @@ pub unsafe fn _t1mskc_u32(x: u32) -> u32 { /// bits. /// /// If the least significant bit of `x` is 0, it sets all bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(t1mskc))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -240,7 +240,7 @@ pub unsafe fn _t1mskc_u64(x: u64) -> u64 { /// bits. /// /// If the least significant bit of `x` is 1, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(tzmsk))] pub unsafe fn _tzmsk_u32(x: u32) -> u32 { @@ -251,7 +251,7 @@ pub unsafe fn _tzmsk_u32(x: u32) -> u32 { /// bits. /// /// If the least significant bit of `x` is 1, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(tzmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions diff --git a/library/stdarch/coresimd/src/x86/i586/xsave.rs b/library/stdarch/coresimd/src/x86/i586/xsave.rs index 9a7611a82efc..ead6cd09f7e9 100644 --- a/library/stdarch/coresimd/src/x86/i586/xsave.rs +++ b/library/stdarch/coresimd/src/x86/i586/xsave.rs @@ -33,7 +33,7 @@ extern "C" { /// /// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of /// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xsave))] pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) { @@ -46,7 +46,7 @@ pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. 
-#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xrstor))] pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) { @@ -62,7 +62,7 @@ const _XCR_XFEATURE_ENABLED_MASK: u32 = 0; /// by `a`. /// /// Currently only `XFEATURE_ENABLED_MASK` `XCR` is supported. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xsetbv))] pub unsafe fn _xsetbv(a: u32, val: u64) { @@ -71,7 +71,7 @@ pub unsafe fn _xsetbv(a: u32, val: u64) { /// Reads the contents of the extended control register `XCR` /// specified in `xcr_no`. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xgetbv))] pub unsafe fn _xgetbv(xcr_no: u32) -> u64 { @@ -85,7 +85,7 @@ pub unsafe fn _xgetbv(xcr_no: u32) -> u64 { /// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize /// the manner in which data is saved. The performance of this instruction will /// be equal to or better than using the `XSAVE` instruction. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaveopt")] #[cfg_attr(test, assert_instr(xsaveopt))] pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) { @@ -98,7 +98,7 @@ pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) { /// `xsavec` differs from `xsave` in that it uses compaction and that it may /// use init optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsavec")] #[cfg_attr(test, assert_instr(xsavec))] pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) { @@ -112,7 +112,7 @@ pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) { /// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the /// modified optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. 
-#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xsaves))] pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) { @@ -128,7 +128,7 @@ pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xrstors))] pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) { diff --git a/library/stdarch/coresimd/src/x86/i686/mmx.rs b/library/stdarch/coresimd/src/x86/i686/mmx.rs index e013fb3dc62a..c5d69bcec034 100644 --- a/library/stdarch/coresimd/src/x86/i686/mmx.rs +++ b/library/stdarch/coresimd/src/x86/i686/mmx.rs @@ -16,7 +16,7 @@ use core::mem; use stdsimd_test::assert_instr; /// Constructs a 64-bit integer vector initialized to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] // FIXME: this produces a movl instead of xorps on x86 // FIXME: this produces a xor intrinsic instead of xorps on x86_64 @@ -26,7 +26,7 @@ pub unsafe fn _mm_setzero_si64() -> __m64 { } /// Add packed 8-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddb))] pub unsafe fn _mm_add_pi8(a: __m64, b: __m64) -> __m64 { @@ -34,7 +34,7 @@ pub unsafe fn _mm_add_pi8(a: __m64, b: __m64) -> __m64 { } /// Add packed 8-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddb))] pub unsafe fn _m_paddb(a: __m64, b: __m64) -> __m64 { @@ -42,7 +42,7 @@ pub unsafe fn _m_paddb(a: __m64, b: __m64) -> __m64 { } /// Add packed 16-bit integers in `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddw))] pub unsafe fn _mm_add_pi16(a: __m64, b: __m64) -> __m64 { @@ -50,7 +50,7 @@ pub unsafe fn _mm_add_pi16(a: __m64, b: __m64) -> __m64 { } /// Add packed 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddw))] pub unsafe fn _m_paddw(a: __m64, b: __m64) -> __m64 { @@ -58,7 +58,7 @@ pub unsafe fn _m_paddw(a: __m64, b: __m64) -> __m64 { } /// Add packed 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddd))] pub unsafe fn _mm_add_pi32(a: __m64, b: __m64) -> __m64 { @@ -66,7 +66,7 @@ pub unsafe fn _mm_add_pi32(a: __m64, b: __m64) -> __m64 { } /// Add packed 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddd))] pub unsafe fn _m_paddd(a: __m64, b: __m64) -> __m64 { @@ -74,7 +74,7 @@ pub unsafe fn _m_paddd(a: __m64, b: __m64) -> __m64 { } /// Add packed 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddsb))] pub unsafe fn _mm_adds_pi8(a: __m64, b: __m64) -> __m64 { @@ -82,7 +82,7 @@ pub unsafe fn _mm_adds_pi8(a: __m64, b: __m64) -> __m64 { } /// Add packed 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddsb))] pub unsafe fn _m_paddsb(a: __m64, b: __m64) -> __m64 { @@ -90,7 +90,7 @@ pub unsafe fn _m_paddsb(a: __m64, b: __m64) -> __m64 { } /// Add packed 16-bit integers in `a` and `b` using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddsw))] pub unsafe fn _mm_adds_pi16(a: __m64, b: __m64) -> __m64 { @@ -98,7 +98,7 @@ pub unsafe fn _mm_adds_pi16(a: __m64, b: __m64) -> __m64 { } /// Add packed 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddsw))] pub unsafe fn _m_paddsw(a: __m64, b: __m64) -> __m64 { @@ -106,7 +106,7 @@ pub unsafe fn _m_paddsw(a: __m64, b: __m64) -> __m64 { } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddusb))] pub unsafe fn _mm_adds_pu8(a: __m64, b: __m64) -> __m64 { @@ -114,7 +114,7 @@ pub unsafe fn _mm_adds_pu8(a: __m64, b: __m64) -> __m64 { } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddusb))] pub unsafe fn _m_paddusb(a: __m64, b: __m64) -> __m64 { @@ -122,7 +122,7 @@ pub unsafe fn _m_paddusb(a: __m64, b: __m64) -> __m64 { } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddusw))] pub unsafe fn _mm_adds_pu16(a: __m64, b: __m64) -> __m64 { @@ -130,7 +130,7 @@ pub unsafe fn _mm_adds_pu16(a: __m64, b: __m64) -> __m64 { } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddusw))] pub unsafe fn _m_paddusw(a: __m64, b: __m64) -> __m64 { @@ -138,7 +138,7 @@ pub unsafe fn _m_paddusw(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubb))] pub unsafe fn _mm_sub_pi8(a: __m64, b: __m64) -> __m64 { @@ -146,7 +146,7 @@ pub unsafe fn _mm_sub_pi8(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubb))] pub unsafe fn _m_psubb(a: __m64, b: __m64) -> __m64 { @@ -154,7 +154,7 @@ pub unsafe fn _m_psubb(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubw))] pub unsafe fn _mm_sub_pi16(a: __m64, b: __m64) -> __m64 { @@ -162,7 +162,7 @@ pub unsafe fn _mm_sub_pi16(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubw))] pub unsafe fn _m_psubw(a: __m64, b: __m64) -> __m64 { @@ -170,7 +170,7 @@ pub unsafe fn _m_psubw(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubd))] pub unsafe fn _mm_sub_pi32(a: __m64, b: __m64) -> __m64 { @@ -178,7 +178,7 @@ pub unsafe fn _mm_sub_pi32(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubd))] pub unsafe fn _m_psubd(a: __m64, b: __m64) -> __m64 { @@ -187,7 +187,7 @@ pub unsafe fn _m_psubd(a: __m64, b: __m64) -> __m64 { /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubsb))] pub unsafe fn _mm_subs_pi8(a: __m64, b: __m64) -> __m64 { @@ -196,7 +196,7 @@ pub unsafe fn _mm_subs_pi8(a: __m64, b: __m64) -> __m64 { /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubsb))] pub unsafe fn _m_psubsb(a: __m64, b: __m64) -> __m64 { @@ -205,7 +205,7 @@ pub unsafe fn _m_psubsb(a: __m64, b: __m64) -> __m64 { /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubsw))] pub unsafe fn _mm_subs_pi16(a: __m64, b: __m64) -> __m64 { @@ -214,7 +214,7 @@ pub unsafe fn _mm_subs_pi16(a: __m64, b: __m64) -> __m64 { /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubsw))] pub unsafe fn _m_psubsw(a: __m64, b: __m64) -> __m64 { @@ -223,7 +223,7 @@ pub unsafe fn _m_psubsw(a: __m64, b: __m64) -> __m64 { /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit /// integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubusb))] pub unsafe fn _mm_subs_pu8(a: __m64, b: __m64) -> __m64 { @@ -232,7 +232,7 @@ pub unsafe fn _mm_subs_pu8(a: __m64, b: __m64) -> __m64 { /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit /// integers in `a` using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubusb))] pub unsafe fn _m_psubusb(a: __m64, b: __m64) -> __m64 { @@ -241,7 +241,7 @@ pub unsafe fn _m_psubusb(a: __m64, b: __m64) -> __m64 { /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned /// 16-bit integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubusw))] pub unsafe fn _mm_subs_pu16(a: __m64, b: __m64) -> __m64 { @@ -250,7 +250,7 @@ pub unsafe fn _mm_subs_pu16(a: __m64, b: __m64) -> __m64 { /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned /// 16-bit integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubusw))] pub unsafe fn _m_psubusw(a: __m64, b: __m64) -> __m64 { @@ -262,7 +262,7 @@ pub unsafe fn _m_psubusw(a: __m64, b: __m64) -> __m64 { /// /// Positive values greater than 0x7F are saturated to 0x7F. Negative values /// less than 0x80 are saturated to 0x80. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(packsswb))] pub unsafe fn _mm_packs_pi16(a: __m64, b: __m64) -> __m64 { @@ -274,7 +274,7 @@ pub unsafe fn _mm_packs_pi16(a: __m64, b: __m64) -> __m64 { /// /// Positive values greater than 0x7F are saturated to 0x7F. Negative values /// less than 0x80 are saturated to 0x80. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(packssdw))] pub unsafe fn _mm_packs_pi32(a: __m64, b: __m64) -> __m64 { @@ -283,7 +283,7 @@ pub unsafe fn _mm_packs_pi32(a: __m64, b: __m64) -> __m64 { /// Compares whether each element of `a` is greater than the corresponding /// element of `b` returning `0` for `false` and `-1` for `true`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(pcmpgtb))] pub unsafe fn _mm_cmpgt_pi8(a: __m64, b: __m64) -> __m64 { @@ -292,7 +292,7 @@ pub unsafe fn _mm_cmpgt_pi8(a: __m64, b: __m64) -> __m64 { /// Compares whether each element of `a` is greater than the corresponding /// element of `b` returning `0` for `false` and `-1` for `true`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(pcmpgtw))] pub unsafe fn _mm_cmpgt_pi16(a: __m64, b: __m64) -> __m64 { @@ -301,7 +301,7 @@ pub unsafe fn _mm_cmpgt_pi16(a: __m64, b: __m64) -> __m64 { /// Compares whether each element of `a` is greater than the corresponding /// element of `b` returning `0` for `false` and `-1` for `true`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(pcmpgtd))] pub unsafe fn _mm_cmpgt_pi32(a: __m64, b: __m64) -> __m64 { @@ -310,7 +310,7 @@ pub unsafe fn _mm_cmpgt_pi32(a: __m64, b: __m64) -> __m64 { /// Unpacks the upper two elements from two `i16x4` vectors and interleaves /// them into the result: `[a.2, b.2, a.3, b.3]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpckhwd))] // FIXME punpcklbw expected pub unsafe fn _mm_unpackhi_pi16(a: __m64, b: __m64) -> __m64 { @@ -319,7 +319,7 @@ pub unsafe fn _mm_unpackhi_pi16(a: __m64, b: __m64) -> __m64 { /// Unpacks the upper four elements from two `i8x8` vectors and interleaves /// them into the result: `[a.4, b.4, a.5, b.5, a.6, b.6, a.7, b.7]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpckhbw))] pub unsafe fn _mm_unpackhi_pi8(a: __m64, b: __m64) -> __m64 { @@ -328,7 +328,7 @@ pub unsafe fn _mm_unpackhi_pi8(a: __m64, b: __m64) -> __m64 { /// Unpacks the lower four elements from two `i8x8` vectors and interleaves /// them into the result: `[a.0, b.0, a.1, b.1, a.2, b.2, a.3, b.3]`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpcklbw))] pub unsafe fn _mm_unpacklo_pi8(a: __m64, b: __m64) -> __m64 { @@ -337,7 +337,7 @@ pub unsafe fn _mm_unpacklo_pi8(a: __m64, b: __m64) -> __m64 { /// Unpacks the lower two elements from two `i16x4` vectors and interleaves /// them into the result: `[a.0 b.0 a.1 b.1]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpcklwd))] pub unsafe fn _mm_unpacklo_pi16(a: __m64, b: __m64) -> __m64 { @@ -346,7 +346,7 @@ pub unsafe fn _mm_unpacklo_pi16(a: __m64, b: __m64) -> __m64 { /// Unpacks the upper element from two `i32x2` vectors and interleaves them /// into the result: `[a.1, b.1]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpckhdq))] pub unsafe fn _mm_unpackhi_pi32(a: __m64, b: __m64) -> __m64 { @@ -355,7 +355,7 @@ pub unsafe fn _mm_unpackhi_pi32(a: __m64, b: __m64) -> __m64 { /// Unpacks the lower element from two `i32x2` vectors and interleaves them /// into the result: `[a.0, b.0]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpckldq))] pub unsafe fn _mm_unpacklo_pi32(a: __m64, b: __m64) -> __m64 { @@ -363,21 +363,21 @@ pub unsafe fn _mm_unpacklo_pi32(a: __m64, b: __m64) -> __m64 { } /// Set packed 16-bit integers in dst with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set_pi16(e3: i16, e2: i16, e1: i16, e0: i16) -> __m64 { _mm_setr_pi16(e0, e1, e2, e3) } /// Set packed 32-bit integers in dst with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set_pi32(e1: i32, e0: i32) -> __m64 { _mm_setr_pi32(e0, e1) } /// Set packed 8-bit integers in dst with the supplied values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set_pi8( e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8 @@ -386,21 +386,21 @@ pub unsafe fn _mm_set_pi8( } /// Broadcast 16-bit integer a to all all elements of dst. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set1_pi16(a: i16) -> __m64 { _mm_setr_pi16(a, a, a, a) } /// Broadcast 32-bit integer a to all all elements of dst. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set1_pi32(a: i32) -> __m64 { _mm_setr_pi32(a, a) } /// Broadcast 8-bit integer a to all all elements of dst. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set1_pi8(a: i8) -> __m64 { _mm_setr_pi8(a, a, a, a, a, a, a, a) @@ -408,7 +408,7 @@ pub unsafe fn _mm_set1_pi8(a: i8) -> __m64 { /// Set packed 16-bit integers in dst with the supplied values in reverse /// order. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_setr_pi16(e0: i16, e1: i16, e2: i16, e3: i16) -> __m64 { mem::transmute(i16x4::new(e0, e1, e2, e3)) @@ -416,14 +416,14 @@ pub unsafe fn _mm_setr_pi16(e0: i16, e1: i16, e2: i16, e3: i16) -> __m64 { /// Set packed 32-bit integers in dst with the supplied values in reverse /// order. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_setr_pi32(e0: i32, e1: i32) -> __m64 { mem::transmute(i32x2::new(e0, e1)) } /// Set packed 8-bit integers in dst with the supplied values in reverse order. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_setr_pi8( e0: i8, e1: i8, e2: i8, e3: i8, e4: i8, e5: i8, e6: i8, e7: i8 diff --git a/library/stdarch/coresimd/src/x86/i686/sse2.rs b/library/stdarch/coresimd/src/x86/i686/sse2.rs index f82c4372a963..ba34838c0326 100644 --- a/library/stdarch/coresimd/src/x86/i686/sse2.rs +++ b/library/stdarch/coresimd/src/x86/i686/sse2.rs @@ -10,7 +10,7 @@ use stdsimd_test::assert_instr; /// Adds two signed or unsigned 64-bit integer values, returning the /// lower 64 bits of the sum. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(paddq))] pub unsafe fn _mm_add_si64(a: __m64, b: __m64) -> __m64 { @@ -20,7 +20,7 @@ pub unsafe fn _mm_add_si64(a: __m64, b: __m64) -> __m64 { /// Multiplies 32-bit unsigned integer values contained in the lower bits /// of the two 64-bit integer vectors and returns the 64-bit unsigned /// product. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(pmuludq))] pub unsafe fn _mm_mul_su32(a: __m64, b: __m64) -> __m64 { @@ -29,7 +29,7 @@ pub unsafe fn _mm_mul_su32(a: __m64, b: __m64) -> __m64 { /// Subtracts signed or unsigned 64-bit integer values and writes the /// difference to the corresponding bits in the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(psubq))] pub unsafe fn _mm_sub_si64(a: __m64, b: __m64) -> __m64 { @@ -39,7 +39,7 @@ pub unsafe fn _mm_sub_si64(a: __m64, b: __m64) -> __m64 { /// Converts the two signed 32-bit integer elements of a 64-bit vector of /// [2 x i32] into two double-precision floating-point values, returned in a /// 128-bit vector of [2 x double]. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(cvtpi2pd))] pub unsafe fn _mm_cvtpi32_pd(a: __m64) -> __m128d { @@ -48,7 +48,7 @@ pub unsafe fn _mm_cvtpi32_pd(a: __m64) -> __m128d { /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with /// the specified 64-bit integer values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // no particular instruction to test pub unsafe fn _mm_set_epi64(e1: __m64, e0: __m64) -> __m128i { @@ -57,7 +57,7 @@ pub unsafe fn _mm_set_epi64(e1: __m64, e0: __m64) -> __m128i { /// Initializes both values in a 128-bit vector of [2 x i64] with the /// specified 64-bit value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // no particular instruction to test pub unsafe fn _mm_set1_epi64(a: __m64) -> __m128i { @@ -66,7 +66,7 @@ pub unsafe fn _mm_set1_epi64(a: __m64) -> __m128i { /// Constructs a 128-bit integer vector, initialized in reverse order /// with the specified 64-bit integral values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // no particular instruction to test pub unsafe fn _mm_setr_epi64(e1: __m64, e0: __m64) -> __m128i { @@ -75,7 +75,7 @@ pub unsafe fn _mm_setr_epi64(e1: __m64, e0: __m64) -> __m128i { /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit /// integer. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // #[cfg_attr(test, assert_instr(movdq2q))] // FIXME: llvm codegens wrong // instr? @@ -85,7 +85,7 @@ pub unsafe fn _mm_movepi64_pi64(a: __m128i) -> __m64 { /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the /// upper bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // #[cfg_attr(test, assert_instr(movq2dq))] // FIXME: llvm codegens wrong // instr? 
@@ -96,7 +96,7 @@ pub unsafe fn _mm_movpi64_epi64(a: __m64) -> __m128i { /// Converts the two double-precision floating-point elements of a /// 128-bit vector of [2 x double] into two signed 32-bit integer values, /// returned in a 64-bit vector of [2 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(cvtpd2pi))] pub unsafe fn _mm_cvtpd_pi32(a: __m128d) -> __m64 { @@ -108,7 +108,7 @@ pub unsafe fn _mm_cvtpd_pi32(a: __m128d) -> __m64 { /// returned in a 64-bit vector of [2 x i32]. /// If the result of either conversion is inexact, the result is truncated /// (rounded towards zero) regardless of the current MXCSR setting. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(cvttpd2pi))] pub unsafe fn _mm_cvttpd_pi32(a: __m128d) -> __m64 { diff --git a/library/stdarch/coresimd/src/x86/i686/sse41.rs b/library/stdarch/coresimd/src/x86/i686/sse41.rs index a8dd65cfe021..3f35305b1874 100644 --- a/library/stdarch/coresimd/src/x86/i686/sse41.rs +++ b/library/stdarch/coresimd/src/x86/i686/sse41.rs @@ -29,7 +29,7 @@ extern "C" { /// /// * `1` - if the specified bits are all zeros, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { @@ -49,7 +49,7 @@ pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the specified bits are all ones, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { @@ -69,7 +69,7 @@ pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the specified bits are neither all zeros nor all ones, /// * `0` - otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 { @@ -89,7 +89,7 @@ pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the specified bits are all zeros, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 { @@ -107,7 +107,7 @@ pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the bits specified in the operand are all set to 1, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pcmpeqd))] #[cfg_attr(test, assert_instr(ptest))] @@ -128,7 +128,7 @@ pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 { /// /// * `1` - if the specified bits are neither all zeros nor all ones, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { diff --git a/library/stdarch/coresimd/src/x86/i686/sse42.rs b/library/stdarch/coresimd/src/x86/i686/sse42.rs index 301dd3ea77bc..f092fe412f9c 100644 --- a/library/stdarch/coresimd/src/x86/i686/sse42.rs +++ b/library/stdarch/coresimd/src/x86/i686/sse42.rs @@ -9,7 +9,7 @@ use stdsimd_test::assert_instr; /// Compare packed 64-bit integers in `a` and `b` for greater-than, /// return the results. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpgtq))] pub unsafe fn _mm_cmpgt_epi64(a: __m128i, b: __m128i) -> __m128i { diff --git a/library/stdarch/coresimd/src/x86/i686/sse4a.rs b/library/stdarch/coresimd/src/x86/i686/sse4a.rs index f35ffb3e501a..5e226322f8c5 100644 --- a/library/stdarch/coresimd/src/x86/i686/sse4a.rs +++ b/library/stdarch/coresimd/src/x86/i686/sse4a.rs @@ -33,7 +33,7 @@ extern "C" { /// /// If `length == 0 && index > 0` or `lenght + index > 64` the result is /// undefined. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(extrq))] pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { @@ -49,7 +49,7 @@ pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { /// /// If the `length` is zero it is interpreted as `64`. If `index + length > 64` /// or `index > 0 && length == 0` the result is undefined. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(insertq))] pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { @@ -57,7 +57,7 @@ pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { } /// Non-temporal store of `a.0` into `p`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(movntsd))] pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { @@ -65,7 +65,7 @@ pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { } /// Non-temporal store of `a.0` into `p`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(movntss))] pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) { diff --git a/library/stdarch/coresimd/src/x86/i686/ssse3.rs b/library/stdarch/coresimd/src/x86/i686/ssse3.rs index 647074096cdf..c386d8a0a4ad 100644 --- a/library/stdarch/coresimd/src/x86/i686/ssse3.rs +++ b/library/stdarch/coresimd/src/x86/i686/ssse3.rs @@ -7,7 +7,7 @@ use x86::*; /// Compute the absolute value of packed 8-bit integers in `a` and /// return the unsigned results. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pabsb))] pub unsafe fn _mm_abs_pi8(a: __m64) -> __m64 { @@ -16,7 +16,7 @@ pub unsafe fn _mm_abs_pi8(a: __m64) -> __m64 { /// Compute the absolute value of packed 8-bit integers in `a`, and return the /// unsigned results. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pabsw))] pub unsafe fn _mm_abs_pi16(a: __m64) -> __m64 { @@ -25,7 +25,7 @@ pub unsafe fn _mm_abs_pi16(a: __m64) -> __m64 { /// Compute the absolute value of packed 32-bit integers in `a`, and return the /// unsigned results. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pabsd))] pub unsafe fn _mm_abs_pi32(a: __m64) -> __m64 { @@ -34,7 +34,7 @@ pub unsafe fn _mm_abs_pi32(a: __m64) -> __m64 { /// Shuffle packed 8-bit integers in `a` according to shuffle control mask in /// the corresponding 8-bit element of `b`, and return the results -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pshufb))] pub unsafe fn _mm_shuffle_pi8(a: __m64, b: __m64) -> __m64 { @@ -43,7 +43,7 @@ pub unsafe fn _mm_shuffle_pi8(a: __m64, b: __m64) -> __m64 { /// Concatenates the two 64-bit integer vector operands, and right-shifts /// the result by the number of bytes specified in the immediate operand. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(palignr, n = 15))] pub unsafe fn _mm_alignr_pi8(a: __m64, b: __m64, n: i32) -> __m64 { @@ -57,7 +57,7 @@ pub unsafe fn _mm_alignr_pi8(a: __m64, b: __m64, n: i32) -> __m64 { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 64-bit vectors of [4 x i16]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phaddw))] pub unsafe fn _mm_hadd_pi16(a: __m64, b: __m64) -> __m64 { @@ -66,7 +66,7 @@ pub unsafe fn _mm_hadd_pi16(a: __m64, b: __m64) -> __m64 { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 64-bit vectors of [2 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phaddd))] pub unsafe fn _mm_hadd_pi32(a: __m64, b: __m64) -> __m64 { @@ -76,7 +76,7 @@ pub unsafe fn _mm_hadd_pi32(a: __m64, b: __m64) -> __m64 { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are /// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phaddsw))] pub unsafe fn _mm_hadds_pi16(a: __m64, b: __m64) -> __m64 { @@ -85,7 +85,7 @@ pub unsafe fn _mm_hadds_pi16(a: __m64, b: __m64) -> __m64 { /// Horizontally subtracts the adjacent pairs of values contained in 2 /// packed 64-bit vectors of [4 x i16]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phsubw))] pub unsafe fn _mm_hsub_pi16(a: __m64, b: __m64) -> __m64 { @@ -94,7 +94,7 @@ pub unsafe fn _mm_hsub_pi16(a: __m64, b: __m64) -> __m64 { /// Horizontally subtracts the adjacent pairs of values contained in 2 /// packed 64-bit vectors of [2 x i32]. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phsubd))] pub unsafe fn _mm_hsub_pi32(a: __m64, b: __m64) -> __m64 { @@ -105,7 +105,7 @@ pub unsafe fn _mm_hsub_pi32(a: __m64, b: __m64) -> __m64 { /// packed 64-bit vectors of [4 x i16]. Positive differences greater than /// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are /// saturated to 8000h. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phsubsw))] pub unsafe fn _mm_hsubs_pi16(a: __m64, b: __m64) -> __m64 { @@ -117,7 +117,7 @@ pub unsafe fn _mm_hsubs_pi16(a: __m64, b: __m64) -> __m64 { /// integer values contained in the second source operand, adds pairs of /// contiguous products with signed saturation, and writes the 16-bit sums to /// the corresponding bits in the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pmaddubsw))] pub unsafe fn _mm_maddubs_pi16(a: __m64, b: __m64) -> __m64 { @@ -127,7 +127,7 @@ pub unsafe fn _mm_maddubs_pi16(a: __m64, b: __m64) -> __m64 { /// Multiplies packed 16-bit signed integer values, truncates the 32-bit /// products to the 18 most significant bits by right-shifting, rounds the /// truncated value by adding 1, and writes bits [16:1] to the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pmulhrsw))] pub unsafe fn _mm_mulhrs_pi16(a: __m64, b: __m64) -> __m64 { @@ -138,7 +138,7 @@ pub unsafe fn _mm_mulhrs_pi16(a: __m64, b: __m64) -> __m64 { /// integer in `b` is negative, and return the results. /// Element in result are zeroed out when the corresponding element in `b` is /// zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(psignb))] pub unsafe fn _mm_sign_pi8(a: __m64, b: __m64) -> __m64 { @@ -149,7 +149,7 @@ pub unsafe fn _mm_sign_pi8(a: __m64, b: __m64) -> __m64 { /// integer in `b` is negative, and return the results. /// Element in result are zeroed out when the corresponding element in `b` is /// zero. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(psignw))] pub unsafe fn _mm_sign_pi16(a: __m64, b: __m64) -> __m64 { @@ -160,7 +160,7 @@ pub unsafe fn _mm_sign_pi16(a: __m64, b: __m64) -> __m64 { /// integer in `b` is negative, and return the results. /// Element in result are zeroed out when the corresponding element in `b` is /// zero. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(psignd))] pub unsafe fn _mm_sign_pi32(a: __m64, b: __m64) -> __m64 { diff --git a/library/stdarch/coresimd/src/x86/mod.rs b/library/stdarch/coresimd/src/x86/mod.rs index 05e99a9b9c62..d1f42f6cc919 100644 --- a/library/stdarch/coresimd/src/x86/mod.rs +++ b/library/stdarch/coresimd/src/x86/mod.rs @@ -17,7 +17,7 @@ macro_rules! 
types { pub struct $name($($fields)*); impl Clone for $name { - #[inline(always)] // currently needed for correctness + #[inline] // currently needed for correctness fn clone(&self) -> $name { *self } @@ -307,49 +307,49 @@ pub use self::test::*; trait m128iExt: Sized { fn as_m128i(self) -> __m128i; - #[inline(always)] + #[inline] fn as_u8x16(self) -> ::v128::u8x16 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_u16x8(self) -> ::v128::u16x8 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_u32x4(self) -> ::v128::u32x4 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_u64x2(self) -> ::v128::u64x2 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_i8x16(self) -> ::v128::i8x16 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_i16x8(self) -> ::v128::i16x8 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_i32x4(self) -> ::v128::i32x4 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_i64x2(self) -> ::v128::i64x2 { unsafe { mem::transmute(self.as_m128i()) } } } impl m128iExt for __m128i { - #[inline(always)] + #[inline] fn as_m128i(self) -> __m128i { self } } @@ -358,49 +358,49 @@ impl m128iExt for __m128i { trait m256iExt: Sized { fn as_m256i(self) -> __m256i; - #[inline(always)] + #[inline] fn as_u8x32(self) -> ::v256::u8x32 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_u16x16(self) -> ::v256::u16x16 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_u32x8(self) -> ::v256::u32x8 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_u64x4(self) -> ::v256::u64x4 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_i8x32(self) -> ::v256::i8x32 { unsafe { mem::transmute(self.as_m256i()) } } - 
#[inline(always)] + #[inline] fn as_i16x16(self) -> ::v256::i16x16 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_i32x8(self) -> ::v256::i32x8 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_i64x4(self) -> ::v256::i64x4 { unsafe { mem::transmute(self.as_m256i()) } } } impl m256iExt for __m256i { - #[inline(always)] + #[inline] fn as_m256i(self) -> __m256i { self } } diff --git a/library/stdarch/coresimd/src/x86/x86_64/abm.rs b/library/stdarch/coresimd/src/x86/x86_64/abm.rs index 988950104854..235fa8bb1719 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/abm.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/abm.rs @@ -4,7 +4,7 @@ use stdsimd_test::assert_instr; /// Counts the leading most significant zero bits. /// /// When the operand is zero, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "lzcnt")] #[cfg_attr(test, assert_instr(lzcnt))] pub unsafe fn _lzcnt_u64(x: u64) -> u64 { @@ -12,7 +12,7 @@ pub unsafe fn _lzcnt_u64(x: u64) -> u64 { } /// Counts the bits that are set. -#[inline(always)] +#[inline] #[target_feature(enable = "popcnt")] #[cfg_attr(test, assert_instr(popcnt))] pub unsafe fn _popcnt64(x: i64) -> i32 { diff --git a/library/stdarch/coresimd/src/x86/x86_64/avx.rs b/library/stdarch/coresimd/src/x86/x86_64/avx.rs index ae92ccd71158..3f9fda1451f2 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/avx.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/avx.rs @@ -5,7 +5,7 @@ use x86::*; /// Copy `a` to result, and insert the 64-bit integer `i` into result /// at the location specified by `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. 
pub unsafe fn _mm256_insert_epi64(a: __m256i, i: i64, index: i32) -> __m256i { diff --git a/library/stdarch/coresimd/src/x86/x86_64/avx2.rs b/library/stdarch/coresimd/src/x86/x86_64/avx2.rs index 840569e7c960..6cdb542bfb95 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/avx2.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/avx2.rs @@ -2,7 +2,7 @@ use simd_llvm::*; use x86::*; /// Extract a 64-bit integer from `a`, selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_extract_epi64(a: __m256i, imm8: i32) -> i64 { diff --git a/library/stdarch/coresimd/src/x86/x86_64/bmi.rs b/library/stdarch/coresimd/src/x86/x86_64/bmi.rs index cda1daa7e710..d8d8a749723b 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/bmi.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/bmi.rs @@ -3,7 +3,7 @@ use stdsimd_test::assert_instr; /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(bextr))] #[cfg(not(target_arch = "x86"))] @@ -16,7 +16,7 @@ pub unsafe fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 { /// /// Bits [7,0] of `control` specify the index to the first bit in the range to /// be extracted, and bits [15,8] specify the length of the range. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(bextr))] #[cfg(not(target_arch = "x86"))] @@ -25,7 +25,7 @@ pub unsafe fn _bextr2_u64(a: u64, control: u64) -> u64 { } /// Bitwise logical `AND` of inverted `a` with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(andn))] pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 { @@ -33,7 +33,7 @@ pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 { } /// Extract lowest set isolated bit. 
-#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsi))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -42,7 +42,7 @@ pub unsafe fn _blsi_u64(x: u64) -> u64 { } /// Get mask up to lowest set bit. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -53,7 +53,7 @@ pub unsafe fn _blsmsk_u64(x: u64) -> u64 { /// Resets the lowest set bit of `x`. /// /// If `x` is sets CF. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsr))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -64,7 +64,7 @@ pub unsafe fn _blsr_u64(x: u64) -> u64 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(tzcnt))] pub unsafe fn _tzcnt_u64(x: u64) -> u64 { @@ -74,7 +74,7 @@ pub unsafe fn _tzcnt_u64(x: u64) -> u64 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(tzcnt))] pub unsafe fn _mm_tzcnt_64(x: u64) -> i64 { diff --git a/library/stdarch/coresimd/src/x86/x86_64/bmi2.rs b/library/stdarch/coresimd/src/x86/x86_64/bmi2.rs index 761fa5fec107..b1c74d15c888 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/bmi2.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/bmi2.rs @@ -5,7 +5,7 @@ use stdsimd_test::assert_instr; /// /// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with /// the low half and the high half of the result. 
-#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(mulx))] #[target_feature(enable = "bmi2")] #[cfg(not(target_arch = "x86"))] // calls an intrinsic @@ -16,7 +16,7 @@ pub unsafe fn _mulx_u64(a: u64, b: u64, hi: &mut u64) -> u64 { } /// Zero higher bits of `a` >= `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(bzhi))] #[cfg(not(target_arch = "x86"))] @@ -26,7 +26,7 @@ pub unsafe fn _bzhi_u64(a: u64, index: u32) -> u64 { /// Scatter contiguous low order bits of `a` to the result at the positions /// specified by the `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pdep))] #[cfg(not(target_arch = "x86"))] @@ -36,7 +36,7 @@ pub unsafe fn _pdep_u64(a: u64, mask: u64) -> u64 { /// Gathers the bits of `x` specified by the `mask` into the contiguous low /// order bit positions of the result. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pext))] #[cfg(not(target_arch = "x86"))] diff --git a/library/stdarch/coresimd/src/x86/x86_64/fxsr.rs b/library/stdarch/coresimd/src/x86/x86_64/fxsr.rs index c2a7391a2b0e..d717db15dc8d 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/fxsr.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/fxsr.rs @@ -21,7 +21,7 @@ extern "C" { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html -#[inline(always)] +#[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxsave64))] pub unsafe fn _fxsave64(mem_addr: *mut u8) { @@ -42,7 +42,7 @@ pub unsafe fn _fxsave64(mem_addr: *mut u8) { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html -#[inline(always)] +#[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxrstor64))] pub unsafe fn _fxrstor64(mem_addr: *const u8) { diff --git 
a/library/stdarch/coresimd/src/x86/x86_64/sse.rs b/library/stdarch/coresimd/src/x86/x86_64/sse.rs index ff7929afc998..4763e81b6f8a 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/sse.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/sse.rs @@ -24,7 +24,7 @@ extern "C" { /// [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTSS2SI` instruction (with 64 bit output). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtss2si))] pub unsafe fn _mm_cvtss_si64(a: __m128) -> i64 { @@ -40,7 +40,7 @@ pub unsafe fn _mm_cvtss_si64(a: __m128) -> i64 { /// point exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTTSS2SI` instruction (with 64 bit output). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvttss2si))] pub unsafe fn _mm_cvttss_si64(a: __m128) -> i64 { @@ -52,7 +52,7 @@ pub unsafe fn _mm_cvttss_si64(a: __m128) -> i64 { /// /// This intrinsic corresponds to the `CVTSI2SS` instruction (with 64 bit /// input). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtsi2ss))] pub unsafe fn _mm_cvtsi64_ss(a: __m128, b: i64) -> __m128 { diff --git a/library/stdarch/coresimd/src/x86/x86_64/sse2.rs b/library/stdarch/coresimd/src/x86/x86_64/sse2.rs index d99a39115740..ff16d1f957af 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/sse2.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/sse2.rs @@ -16,7 +16,7 @@ extern "C" { /// Convert the lower double-precision (64-bit) floating-point element in a to /// a 64-bit integer. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2si))] pub unsafe fn _mm_cvtsd_si64(a: __m128d) -> i64 { @@ -24,7 +24,7 @@ pub unsafe fn _mm_cvtsd_si64(a: __m128d) -> i64 { } /// Alias for [`_mm_cvtsd_si64`](fn._mm_cvtsd_si64_ss.html). 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2si))] pub unsafe fn _mm_cvtsd_si64x(a: __m128d) -> i64 { @@ -33,7 +33,7 @@ pub unsafe fn _mm_cvtsd_si64x(a: __m128d) -> i64 { /// Convert the lower double-precision (64-bit) floating-point element in `a` /// to a 64-bit integer with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttsd2si))] pub unsafe fn _mm_cvttsd_si64(a: __m128d) -> i64 { @@ -41,7 +41,7 @@ pub unsafe fn _mm_cvttsd_si64(a: __m128d) -> i64 { } /// Alias for [`_mm_cvttsd_si64`](fn._mm_cvttsd_si64_ss.html). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttsd2si))] pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 { @@ -51,7 +51,7 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 { /// Stores a 64-bit integer value in the specified memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movnti))] pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) { @@ -60,7 +60,7 @@ pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) { /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] pub unsafe fn _mm_cvtsi64_si128(a: i64) -> __m128i { @@ -69,7 +69,7 @@ pub unsafe fn _mm_cvtsi64_si128(a: i64) -> __m128i { /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> __m128i { @@ -77,7 +77,7 @@ pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> __m128i { } /// Return the lowest element of `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] pub unsafe fn _mm_cvtsi128_si64(a: __m128i) -> i64 { @@ -85,7 +85,7 @@ pub unsafe fn _mm_cvtsi128_si64(a: __m128i) -> i64 { } /// Return the lowest element of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] pub unsafe fn _mm_cvtsi128_si64x(a: __m128i) -> i64 { @@ -94,7 +94,7 @@ pub unsafe fn _mm_cvtsi128_si64x(a: __m128i) -> i64 { /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsi2sd))] pub unsafe fn _mm_cvtsi64_sd(a: __m128d, b: i64) -> __m128d { @@ -103,7 +103,7 @@ pub unsafe fn _mm_cvtsi64_sd(a: __m128d, b: i64) -> __m128d { /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsi2sd))] pub unsafe fn _mm_cvtsi64x_sd(a: __m128d, b: i64) -> __m128d { diff --git a/library/stdarch/coresimd/src/x86/x86_64/sse41.rs b/library/stdarch/coresimd/src/x86/x86_64/sse41.rs index a7f25a4ae324..2747ad44718d 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/sse41.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/sse41.rs @@ -9,7 +9,7 @@ use simd_llvm::*; use stdsimd_test::assert_instr; /// Extract an 64-bit integer from `a` selected with `imm8` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] // TODO: Add test for Windows #[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8 = 1))] @@ -20,7 +20,7 @@ pub unsafe fn _mm_extract_epi64(a: __m128i, imm8: i32) -> i64 { /// Return a copy of `a` with the 64-bit integer from `i` inserted at a /// location specified by `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pinsrq, imm8 = 0))] pub unsafe fn _mm_insert_epi64(a: __m128i, i: i64, imm8: i32) -> __m128i { diff --git a/library/stdarch/coresimd/src/x86/x86_64/sse42.rs b/library/stdarch/coresimd/src/x86/x86_64/sse42.rs index 12fd87ea2b1a..6fe79ea8c01c 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/sse42.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/sse42.rs @@ -11,7 +11,7 @@ extern "C" { /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 64-bit integer `v`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] pub unsafe fn _mm_crc32_u64(crc: u64, v: u64) -> u64 { diff --git a/library/stdarch/coresimd/src/x86/x86_64/xsave.rs b/library/stdarch/coresimd/src/x86/x86_64/xsave.rs index fc8b38ced6d4..0ddd8b1476b4 100644 --- a/library/stdarch/coresimd/src/x86/x86_64/xsave.rs +++ b/library/stdarch/coresimd/src/x86/x86_64/xsave.rs @@ -29,7 +29,7 @@ extern "C" { /// /// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of /// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xsave64))] pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) { @@ -42,7 +42,7 @@ pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xrstor64))] pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) { @@ -56,7 +56,7 @@ pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) { /// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize /// the manner in which data is saved. 
The performance of this instruction will /// be equal to or better than using the `XSAVE64` instruction. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaveopt")] #[cfg_attr(test, assert_instr(xsaveopt64))] pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) { @@ -69,7 +69,7 @@ pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) { /// `xsavec` differs from `xsave` in that it uses compaction and that it may /// use init optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsavec")] #[cfg_attr(test, assert_instr(xsavec64))] pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) { @@ -83,7 +83,7 @@ pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) { /// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the /// modified optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xsaves64))] pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) { @@ -99,7 +99,7 @@ pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xrstors64))] pub unsafe fn _xrstors64(mem_addr: *const u8, rs_mask: u64) {