diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs
index 10c1f2de8d20..20c61449a7b7 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs
@@ -33,10 +33,12 @@ use stdarch_test::assert_instr;
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpabsd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
-    let a = a.as_i32x8();
-    let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
-    transmute(r)
+pub fn _mm256_abs_epi32(a: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i32x8();
+        let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
+        transmute(r)
+    }
 }
 
 /// Computes the absolute values of packed 16-bit integers in `a`.
@@ -46,10 +48,12 @@ pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpabsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
-    let a = a.as_i16x16();
-    let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
-    transmute(r)
+pub fn _mm256_abs_epi16(a: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i16x16();
+        let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
+        transmute(r)
+    }
 }
 
 /// Computes the absolute values of packed 8-bit integers in `a`.
@@ -59,10 +63,12 @@ pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpabsb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i {
-    let a = a.as_i8x32();
-    let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
-    transmute(r)
+pub fn _mm256_abs_epi8(a: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i8x32();
+        let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
+        transmute(r)
+    }
 }
 
 /// Adds packed 64-bit integers in `a` and `b`.
@@ -72,8 +78,8 @@ pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpaddq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
-    transmute(simd_add(a.as_i64x4(), b.as_i64x4()))
+pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(simd_add(a.as_i64x4(), b.as_i64x4())) }
 }
 
 /// Adds packed 32-bit integers in `a` and `b`.
@@ -83,8 +89,8 @@ pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpaddd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
-    transmute(simd_add(a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(simd_add(a.as_i32x8(), b.as_i32x8())) }
 }
 
 /// Adds packed 16-bit integers in `a` and `b`.
@@ -94,8 +100,8 @@ pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpaddw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(simd_add(a.as_i16x16(), b.as_i16x16()))
+pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(simd_add(a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Adds packed 8-bit integers in `a` and `b`.
@@ -105,8 +111,8 @@ pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpaddb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
-    transmute(simd_add(a.as_i8x32(), b.as_i8x32()))
+pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(simd_add(a.as_i8x32(), b.as_i8x32())) }
 }
 
 /// Adds packed 8-bit integers in `a` and `b` using saturation.
@@ -116,8 +122,8 @@ pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpaddsb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
-    transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32()))
+pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) }
 }
 
 /// Adds packed 16-bit integers in `a` and `b` using saturation.
@@ -127,8 +133,8 @@ pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpaddsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16()))
+pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
@@ -138,8 +144,8 @@ pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpaddusb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
-    transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32()))
+pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) }
 }
 
 /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
@@ -149,8 +155,8 @@ pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpaddusw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16()))
+pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) }
 }
 
 /// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
@@ -162,160 +168,162 @@ pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+pub fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
     static_assert_uimm_bits!(IMM8, 8);
-    // If palignr is shifting the pair of vectors more than the size of two
-    // lanes, emit zero.
-    if IMM8 >= 32 {
-        return _mm256_setzero_si256();
+    unsafe {
+        // If palignr is shifting the pair of vectors more than the size of two
+        // lanes, emit zero.
+        if IMM8 >= 32 {
+            return _mm256_setzero_si256();
+        }
+        // If palignr is shifting the pair of input vectors more than one lane,
+        // but less than two lanes, convert to shifting in zeroes.
+        let (a, b) = if IMM8 > 16 {
+            (_mm256_setzero_si256(), a)
+        } else {
+            (a, b)
+        };
+
+        let a = a.as_i8x32();
+        let b = b.as_i8x32();
+
+        if IMM8 == 16 {
+            return transmute(a);
+        }
+
+        let r: i8x32 = match IMM8 % 16 {
+            0 => simd_shuffle!(
+                b,
+                a,
+                [
+                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                ],
+            ),
+            1 => simd_shuffle!(
+                b,
+                a,
+                [
+                    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22,
+                    23, 24, 25, 26, 27, 28, 29, 30, 31, 48,
+                ],
+            ),
+            2 => simd_shuffle!(
+                b,
+                a,
+                [
+                    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23,
+                    24, 25, 26, 27, 28, 29, 30, 31, 48, 49,
+                ],
+            ),
+            3 => simd_shuffle!(
+                b,
+                a,
+                [
+                    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23,
+                    24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
+                ],
+            ),
+            4 => simd_shuffle!(
+                b,
+                a,
+                [
+                    4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24,
+                    25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
+                ],
+            ),
+            5 => simd_shuffle!(
+                b,
+                a,
+                [
+                    5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25,
+                    26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
+                ],
+            ),
+            6 => simd_shuffle!(
+                b,
+                a,
+                [
+                    6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26,
+                    27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
+                ],
+            ),
+            7 => simd_shuffle!(
+                b,
+                a,
+                [
+                    7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26,
+                    27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
+                ],
+            ),
+            8 => simd_shuffle!(
+                b,
+                a,
+                [
+                    8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27,
+                    28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
+                ],
+            ),
+            9 => simd_shuffle!(
+                b,
+                a,
+                [
+                    9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28,
+                    29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
+                ],
+            ),
+            10 => simd_shuffle!(
+                b,
+                a,
+                [
+                    10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29,
+                    30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
+                ],
+            ),
+            11 => simd_shuffle!(
+                b,
+                a,
+                [
+                    11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30,
+                    31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+                ],
+            ),
+            12 => simd_shuffle!(
+                b,
+                a,
+                [
+                    12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31,
+                    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+                ],
+            ),
+            13 => simd_shuffle!(
+                b,
+                a,
+                [
+                    13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48,
+                    49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+                ],
+            ),
+            14 => simd_shuffle!(
+                b,
+                a,
+                [
+                    14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49,
+                    50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+                ],
+            ),
+            15 => simd_shuffle!(
+                b,
+                a,
+                [
+                    15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50,
+                    51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+                ],
+            ),
+            _ => unreachable_unchecked(),
+        };
+        transmute(r)
     }
-    // If palignr is shifting the pair of input vectors more than one lane,
-    // but less than two lanes, convert to shifting in zeroes.
-    let (a, b) = if IMM8 > 16 {
-        (_mm256_setzero_si256(), a)
-    } else {
-        (a, b)
-    };
-
-    let a = a.as_i8x32();
-    let b = b.as_i8x32();
-
-    if IMM8 == 16 {
-        return transmute(a);
-    }
-
-    let r: i8x32 = match IMM8 % 16 {
-        0 => simd_shuffle!(
-            b,
-            a,
-            [
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
-                23, 24, 25, 26, 27, 28, 29, 30, 31,
-            ],
-        ),
-        1 => simd_shuffle!(
-            b,
-            a,
-            [
-                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
-                24, 25, 26, 27, 28, 29, 30, 31, 48,
-            ],
-        ),
-        2 => simd_shuffle!(
-            b,
-            a,
-            [
-                2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24,
-                25, 26, 27, 28, 29, 30, 31, 48, 49,
-            ],
-        ),
-        3 => simd_shuffle!(
-            b,
-            a,
-            [
-                3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24,
-                25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
-            ],
-        ),
-        4 => simd_shuffle!(
-            b,
-            a,
-            [
-                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25,
-                26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
-            ],
-        ),
-        5 => simd_shuffle!(
-            b,
-            a,
-            [
-                5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26,
-                27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
-            ],
-        ),
-        6 => simd_shuffle!(
-            b,
-            a,
-            [
-                6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27,
-                28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
-            ],
-        ),
-        7 => simd_shuffle!(
-            b,
-            a,
-            [
-                7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27,
-                28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
-            ],
-        ),
-        8 => simd_shuffle!(
-            b,
-            a,
-            [
-                8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28,
-                29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
-            ],
-        ),
-        9 => simd_shuffle!(
-            b,
-            a,
-            [
-                9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29,
-                30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
-            ],
-        ),
-        10 => simd_shuffle!(
-            b,
-            a,
-            [
-                10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30,
-                31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
-            ],
-        ),
-        11 => simd_shuffle!(
-            b,
-            a,
-            [
-                11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31,
-                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
-            ],
-        ),
-        12 => simd_shuffle!(
-            b,
-            a,
-            [
-                12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48,
-                49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
-            ],
-        ),
-        13 => simd_shuffle!(
-            b,
-            a,
-            [
-                13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49,
-                50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
-            ],
-        ),
-        14 => simd_shuffle!(
-            b,
-            a,
-            [
-                14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50,
-                51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
-            ],
-        ),
-        15 => simd_shuffle!(
-            b,
-            a,
-            [
-                15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51,
-                52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
-            ],
-        ),
-        _ => unreachable_unchecked(),
-    };
-    transmute(r)
 }
 
 /// Computes the bitwise AND of 256 bits (representing integer data)
@@ -326,8 +334,8 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vandps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
-    transmute(simd_and(a.as_i64x4(), b.as_i64x4()))
+pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(simd_and(a.as_i64x4(), b.as_i64x4())) }
 }
 
 /// Computes the bitwise NOT of 256 bits (representing integer data)
@@ -338,12 +346,14 @@ pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vandnps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
-    let all_ones = _mm256_set1_epi8(-1);
-    transmute(simd_and(
-        simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
-        b.as_i64x4(),
-    ))
+pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let all_ones = _mm256_set1_epi8(-1);
+        transmute(simd_and(
+            simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
+            b.as_i64x4(),
+        ))
+    }
 }
 
 /// Averages packed unsigned 16-bit integers in `a` and `b`.
@@ -353,11 +363,13 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpavgw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
-    let a = simd_cast::<_, u32x16>(a.as_u16x16());
-    let b = simd_cast::<_, u32x16>(b.as_u16x16());
-    let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
-    transmute(simd_cast::<_, u16x16>(r))
+pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = simd_cast::<_, u32x16>(a.as_u16x16());
+        let b = simd_cast::<_, u32x16>(b.as_u16x16());
+        let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
+        transmute(simd_cast::<_, u16x16>(r))
+    }
 }
 
 /// Averages packed unsigned 8-bit integers in `a` and `b`.
@@ -367,11 +379,13 @@ pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpavgb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
-    let a = simd_cast::<_, u16x32>(a.as_u8x32());
-    let b = simd_cast::<_, u16x32>(b.as_u8x32());
-    let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
-    transmute(simd_cast::<_, u8x32>(r))
+pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = simd_cast::<_, u16x32>(a.as_u8x32());
+        let b = simd_cast::<_, u16x32>(b.as_u8x32());
+        let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
+        transmute(simd_cast::<_, u8x32>(r))
+    }
 }
 
 /// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
@@ -382,21 +396,23 @@ pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
+pub fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
     static_assert_uimm_bits!(IMM4, 4);
-    let a = a.as_i32x4();
-    let b = b.as_i32x4();
-    let r: i32x4 = simd_shuffle!(
-        a,
-        b,
-        [
-            [0, 4, 0, 4][IMM4 as usize & 0b11],
-            [1, 1, 5, 5][IMM4 as usize & 0b11],
-            [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
-            [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
-        ],
-    );
-    transmute(r)
+    unsafe {
+        let a = a.as_i32x4();
+        let b = b.as_i32x4();
+        let r: i32x4 = simd_shuffle!(
+            a,
+            b,
+            [
+                [0, 4, 0, 4][IMM4 as usize & 0b11],
+                [1, 1, 5, 5][IMM4 as usize & 0b11],
+                [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
+                [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
+            ],
+        );
+        transmute(r)
+    }
 }
 
 /// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
@@ -407,25 +423,27 @@ pub unsafe fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128
 #[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+pub fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
     static_assert_uimm_bits!(IMM8, 8);
-    let a = a.as_i32x8();
-    let b = b.as_i32x8();
-    let r: i32x8 = simd_shuffle!(
-        a,
-        b,
-        [
-            [0, 8, 0, 8][IMM8 as usize & 0b11],
-            [1, 1, 9, 9][IMM8 as usize & 0b11],
-            [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
-            [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
-            [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
-            [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
-            [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
-            [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
-        ],
-    );
-    transmute(r)
+    unsafe {
+        let a = a.as_i32x8();
+        let b = b.as_i32x8();
+        let r: i32x8 = simd_shuffle!(
+            a,
+            b,
+            [
+                [0, 8, 0, 8][IMM8 as usize & 0b11],
+                [1, 1, 9, 9][IMM8 as usize & 0b11],
+                [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
+                [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
+                [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
+                [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
+                [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
+                [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
+            ],
+        );
+        transmute(r)
+    }
 }
 
 /// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
@@ -436,34 +454,36 @@ pub unsafe fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m
 #[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+pub fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
     static_assert_uimm_bits!(IMM8, 8);
-    let a = a.as_i16x16();
-    let b = b.as_i16x16();
+    unsafe {
+        let a = a.as_i16x16();
+        let b = b.as_i16x16();
 
-    let r: i16x16 = simd_shuffle!(
-        a,
-        b,
-        [
-            [0, 16, 0, 16][IMM8 as usize & 0b11],
-            [1, 1, 17, 17][IMM8 as usize & 0b11],
-            [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
-            [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
-            [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
-            [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
-            [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
-            [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
-            [8, 24, 8, 24][IMM8 as usize & 0b11],
-            [9, 9, 25, 25][IMM8 as usize & 0b11],
-            [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
-            [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
-            [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
-            [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
-            [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
-            [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
-        ],
-    );
-    transmute(r)
+        let r: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [
+                [0, 16, 0, 16][IMM8 as usize & 0b11],
+                [1, 1, 17, 17][IMM8 as usize & 0b11],
+                [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
+                [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
+                [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
+                [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
+                [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
+                [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
+                [8, 24, 8, 24][IMM8 as usize & 0b11],
+                [9, 9, 25, 25][IMM8 as usize & 0b11],
+                [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
+                [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
+                [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
+                [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
+                [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
+                [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
+            ],
+        );
+        transmute(r)
+    }
 }
 
 /// Blends packed 8-bit integers from `a` and `b` using `mask`.
@@ -473,9 +493,11 @@ pub unsafe fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpblendvb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
-    let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
-    transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
+pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
+    unsafe {
+        let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
+        transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
+    }
 }
 
 /// Broadcasts the low packed 8-bit integer from `a` to all elements of
@@ -486,9 +508,11 @@ pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m25
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpbroadcastb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
-    let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
-    transmute::<i8x16, _>(ret)
+pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
+    unsafe {
+        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
+        transmute::<i8x16, _>(ret)
+    }
 }
 
 /// Broadcasts the low packed 8-bit integer from `a` to all elements of
@@ -499,9 +523,11 @@ pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpbroadcastb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
-    let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
-    transmute::<i8x32, _>(ret)
+pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
+    unsafe {
+        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
+        transmute::<i8x32, _>(ret)
+    }
 }
 
 // N.B., `simd_shuffle4` with integer data types for `a` and `b` is
@@ -514,9 +540,11 @@ pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
-    let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
-    transmute::<i32x4, _>(ret)
+pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
+    unsafe {
+        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
+        transmute::<i32x4, _>(ret)
+    }
 }
 
 // N.B., `simd_shuffle4` with integer data types for `a` and `b` is
@@ -529,9 +557,11 @@ pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
-    let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
-    transmute::<i32x8, _>(ret)
+pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
+    unsafe {
+        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
+        transmute::<i32x8, _>(ret)
+    }
 }
 
 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
@@ -544,9 +574,11 @@ pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 // See https://github.com/rust-lang/stdarch/issues/791
 #[cfg_attr(test, assert_instr(vmovddup))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
-    let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
-    transmute::<i64x2, _>(ret)
+pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
+    unsafe {
+        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
+        transmute::<i64x2, _>(ret)
+    }
 }
 
 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
@@ -557,9 +589,11 @@ pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vbroadcastsd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
-    let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
-    transmute::<i64x4, _>(ret)
+pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
+    unsafe {
+        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
+        transmute::<i64x4, _>(ret)
+    }
 }
 
 /// Broadcasts the low double-precision (64-bit) floating-point element
@@ -570,8 +604,8 @@ pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vmovddup))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
-    simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2])
+pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
+    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2]) }
 }
 
 /// Broadcasts the low double-precision (64-bit) floating-point element
@@ -582,8 +616,8 @@ pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vbroadcastsd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
-    simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4])
+pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
+    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4]) }
 }
 
 /// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
@@ -593,9 +627,11 @@ pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
 #[inline]
 #[target_feature(enable = "avx2")]
 #[stable(feature = "simd_x86_updates", since = "1.82.0")]
-pub unsafe fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
-    let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
-    transmute::<i64x4, _>(ret)
+pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
+    unsafe {
+        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
+        transmute::<i64x4, _>(ret)
+    }
 }
 
 // N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
@@ -607,9 +643,11 @@ pub unsafe fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
 #[inline]
 #[target_feature(enable = "avx2")]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
-    let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
-    transmute::<i64x4, _>(ret)
+pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
+    unsafe {
+        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
+        transmute::<i64x4, _>(ret)
+    }
 }
 
 /// Broadcasts the low single-precision (32-bit) floating-point element
@@ -620,8 +658,8 @@ pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
-    simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4])
+pub fn _mm_broadcastss_ps(a: __m128) -> __m128 {
+    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4]) }
 }
 
 /// Broadcasts the low single-precision (32-bit) floating-point element
@@ -632,8 +670,8 @@ pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
-    simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8])
+pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
+    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8]) }
 }
 
 /// Broadcasts the low packed 16-bit integer from a to all elements of
@@ -644,9 +682,11 @@ pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpbroadcastw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
-    let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
-    transmute::<i16x8, _>(ret)
+pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
+    unsafe {
+        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
+        transmute::<i16x8, _>(ret)
+    }
 }
 
 /// Broadcasts the low packed 16-bit integer from a to all elements of
@@ -657,9 +697,11 @@ pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpbroadcastw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
-    let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
-    transmute::<i16x16, _>(ret)
+pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
+    unsafe {
+        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
+        transmute::<i16x16, _>(ret)
+    }
 }
 
 /// Compares packed 64-bit integers in `a` and `b` for equality.
@@ -669,8 +711,8 @@ pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpeqq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
-    transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
+pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) }
 }
 
 /// Compares packed 32-bit integers in `a` and `b` for equality.
@@ -680,8 +722,8 @@ pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpeqd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
-    transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8())) }
 }
 
 /// Compares packed 16-bit integers in `a` and `b` for equality.
@@ -691,8 +733,8 @@ pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpeqw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
+pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Compares packed 8-bit integers in `a` and `b` for equality.
@@ -702,8 +744,8 @@ pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpeqb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
-    transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
+pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32())) }
 }
 
 /// Compares packed 64-bit integers in `a` and `b` for greater-than.
@@ -713,8 +755,8 @@ pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpgtq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
-    transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
+pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) }
 }
 
 /// Compares packed 32-bit integers in `a` and `b` for greater-than.
@@ -724,8 +766,8 @@ pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpgtd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
-    transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8())) }
 }
 
 /// Compares packed 16-bit integers in `a` and `b` for greater-than.
@@ -735,8 +777,8 @@ pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpgtw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
+pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Compares packed 8-bit integers in `a` and `b` for greater-than.
@@ -746,8 +788,8 @@ pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpgtb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
-    transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
+pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32())) }
 }
 
 /// Sign-extend 16-bit integers to 32-bit integers.
@@ -757,8 +799,8 @@ pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovsxwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
-    transmute::<i32x8, _>(simd_cast(a.as_i16x8()))
+pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
+    unsafe { transmute::<i32x8, _>(simd_cast(a.as_i16x8())) }
 }
 
 /// Sign-extend 16-bit integers to 64-bit integers.
@@ -768,10 +810,12 @@ pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovsxwq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
-    let a = a.as_i16x8();
-    let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
-    transmute::<i64x4, _>(simd_cast(v64))
+pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
+    unsafe {
+        let a = a.as_i16x8();
+        let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
+        transmute::<i64x4, _>(simd_cast(v64))
+    }
 }
 
 /// Sign-extend 32-bit integers to 64-bit integers.
@@ -781,8 +825,8 @@ pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovsxdq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
-    transmute::<i64x4, _>(simd_cast(a.as_i32x4()))
+pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
+    unsafe { transmute::<i64x4, _>(simd_cast(a.as_i32x4())) }
 }
 
 /// Sign-extend 8-bit integers to 16-bit integers.
@@ -792,8 +836,8 @@ pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovsxbw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
-    transmute::<i16x16, _>(simd_cast(a.as_i8x16()))
+pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
+    unsafe { transmute::<i16x16, _>(simd_cast(a.as_i8x16())) }
 }
 
 /// Sign-extend 8-bit integers to 32-bit integers.
@@ -803,10 +847,12 @@ pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovsxbd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
-    let a = a.as_i8x16();
-    let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
-    transmute::<i32x8, _>(simd_cast(v64))
+pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
+    unsafe {
+        let a = a.as_i8x16();
+        let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+        transmute::<i32x8, _>(simd_cast(v64))
+    }
 }
 
 /// Sign-extend 8-bit integers to 64-bit integers.
@@ -816,10 +862,12 @@ pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovsxbq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
-    let a = a.as_i8x16();
-    let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
-    transmute::<i64x4, _>(simd_cast(v32))
+pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
+    unsafe {
+        let a = a.as_i8x16();
+        let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
+        transmute::<i64x4, _>(simd_cast(v32))
+    }
 }
 
 /// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit
@@ -830,8 +878,8 @@ pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovzxwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
-    transmute::<u32x8, _>(simd_cast(a.as_u16x8()))
+pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
+    unsafe { transmute::<u32x8, _>(simd_cast(a.as_u16x8())) }
 }
 
 /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
@@ -842,10 +890,12 @@ pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovzxwq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
-    let a = a.as_u16x8();
-    let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
-    transmute::<u64x4, _>(simd_cast(v64))
+pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
+    unsafe {
+        let a = a.as_u16x8();
+        let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
+        transmute::<u64x4, _>(simd_cast(v64))
+    }
 }
 
 /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
@@ -855,8 +905,8 @@ pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovzxdq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
-    transmute::<u64x4, _>(simd_cast(a.as_u32x4()))
+pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
+    unsafe { transmute::<u64x4, _>(simd_cast(a.as_u32x4())) }
 }
 
 /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
@@ -866,8 +916,8 @@ pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovzxbw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
-    transmute::<u16x16, _>(simd_cast(a.as_u8x16()))
+pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
+    unsafe { transmute::<u16x16, _>(simd_cast(a.as_u8x16())) }
 }
 
 /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
@@ -878,10 +928,12 @@ pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovzxbd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
-    let a = a.as_u8x16();
-    let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
-    transmute::<u32x8, _>(simd_cast(v64))
+pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
+    unsafe {
+        let a = a.as_u8x16();
+        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+        transmute::<u32x8, _>(simd_cast(v64))
+    }
 }
 
 /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
@@ -892,10 +944,12 @@ pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovzxbq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
-    let a = a.as_u8x16();
-    let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
-    transmute::<u64x4, _>(simd_cast(v32))
+pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
+    unsafe {
+        let a = a.as_u8x16();
+        let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
+        transmute::<u64x4, _>(simd_cast(v32))
+    }
 }
 
 /// Extracts 128 bits (of integer data) from `a` selected with `IMM1`.
@@ -909,12 +963,14 @@ pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
 )]
 #[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
+pub fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
     static_assert_uimm_bits!(IMM1, 1);
-    let a = a.as_i64x4();
-    let b = i64x4::ZERO;
-    let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
-    transmute(dst)
+    unsafe {
+        let a = a.as_i64x4();
+        let b = i64x4::ZERO;
+        let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
+        transmute(dst)
+    }
 }
 
 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
@@ -924,8 +980,8 @@ pub unsafe fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vphaddw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(phaddw(a.as_i16x16(), b.as_i16x16()))
+pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
@@ -935,8 +991,8 @@ pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vphaddd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
-    transmute(phaddd(a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(phaddd(a.as_i32x8(), b.as_i32x8())) }
 }
 
 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
@@ -947,8 +1003,8 @@ pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vphaddsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(phaddsw(a.as_i16x16(), b.as_i16x16()))
+pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
@@ -958,8 +1014,8 @@ pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vphsubw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(phsubw(a.as_i16x16(), b.as_i16x16()))
+pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(phsubw(a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
@@ -969,8 +1025,8 @@ pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vphsubd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
-    transmute(phsubd(a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(phsubd(a.as_i32x8(), b.as_i32x8())) }
 }
 
 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
@@ -981,8 +1037,8 @@ pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vphsubsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(phsubsw(a.as_i16x16(), b.as_i16x16()))
+pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Returns values from `slice` at offsets determined by `offsets * scale`,
@@ -1731,12 +1787,14 @@ pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
 )]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
+pub fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
     static_assert_uimm_bits!(IMM1, 1);
-    let a = a.as_i64x4();
-    let b = _mm256_castsi128_si256(b).as_i64x4();
-    let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
-    transmute(dst)
+    unsafe {
+        let a = a.as_i64x4();
+        let b = _mm256_castsi128_si256(b).as_i64x4();
+        let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
+        transmute(dst)
+    }
 }
 
 /// Multiplies packed signed 16-bit integers in `a` and `b`, producing
@@ -1748,8 +1806,8 @@ pub unsafe fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmaddwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(pmaddwd(a.as_i16x16(), b.as_i16x16()))
+pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Vertically multiplies each unsigned 8-bit integer from `a` with the
@@ -1762,8 +1820,8 @@ pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmaddubsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32()))
+pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) }
 }
 
 /// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
@@ -1878,10 +1936,12 @@ pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m25
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmaxsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_i16x16();
-    let b = b.as_i16x16();
-    transmute(simd_select::<m16x16, _>(simd_gt(a, b), a, b))
+pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i16x16();
+        let b = b.as_i16x16();
+        transmute(simd_select::<m16x16, _>(simd_gt(a, b), a, b))
+    }
 }
 
 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
@@ -1892,10 +1952,12 @@ pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmaxsd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_i32x8();
-    let b = b.as_i32x8();
-    transmute(simd_select::<m32x8, _>(simd_gt(a, b), a, b))
+pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i32x8();
+        let b = b.as_i32x8();
+        transmute(simd_select::<m32x8, _>(simd_gt(a, b), a, b))
+    }
 }
 
 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
@@ -1906,10 +1968,12 @@ pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmaxsb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_i8x32();
-    let b = b.as_i8x32();
-    transmute(simd_select::<m8x32, _>(simd_gt(a, b), a, b))
+pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i8x32();
+        let b = b.as_i8x32();
+        transmute(simd_select::<m8x32, _>(simd_gt(a, b), a, b))
+    }
 }
 
 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
@@ -1920,10 +1984,12 @@ pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmaxuw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_u16x16();
-    let b = b.as_u16x16();
-    transmute(simd_select::<m16x16, _>(simd_gt(a, b), a, b))
+pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u16x16();
+        let b = b.as_u16x16();
+        transmute(simd_select::<m16x16, _>(simd_gt(a, b), a, b))
+    }
 }
 
 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
@@ -1934,10 +2000,12 @@ pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmaxud))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_u32x8();
-    let b = b.as_u32x8();
-    transmute(simd_select::<m32x8, _>(simd_gt(a, b), a, b))
+pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u32x8();
+        let b = b.as_u32x8();
+        transmute(simd_select::<m32x8, _>(simd_gt(a, b), a, b))
+    }
 }
 
 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
@@ -1948,10 +2016,12 @@ pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmaxub))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_u8x32();
-    let b = b.as_u8x32();
-    transmute(simd_select::<m8x32, _>(simd_gt(a, b), a, b))
+pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u8x32();
+        let b = b.as_u8x32();
+        transmute(simd_select::<m8x32, _>(simd_gt(a, b), a, b))
+    }
 }
 
 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
@@ -1962,10 +2032,12 @@ pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpminsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_i16x16();
-    let b = b.as_i16x16();
-    transmute(simd_select::<m16x16, _>(simd_lt(a, b), a, b))
+pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i16x16();
+        let b = b.as_i16x16();
+        transmute(simd_select::<m16x16, _>(simd_lt(a, b), a, b))
+    }
 }
 
 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
@@ -1976,10 +2048,12 @@ pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpminsd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_i32x8();
-    let b = b.as_i32x8();
-    transmute(simd_select::<m32x8, _>(simd_lt(a, b), a, b))
+pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i32x8();
+        let b = b.as_i32x8();
+        transmute(simd_select::<m32x8, _>(simd_lt(a, b), a, b))
+    }
 }
 
 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
@@ -1990,10 +2064,12 @@ pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpminsb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_i8x32();
-    let b = b.as_i8x32();
-    transmute(simd_select::<m8x32, _>(simd_lt(a, b), a, b))
+pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i8x32();
+        let b = b.as_i8x32();
+        transmute(simd_select::<m8x32, _>(simd_lt(a, b), a, b))
+    }
 }
 
 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
@@ -2004,10 +2080,12 @@ pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpminuw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_u16x16();
-    let b = b.as_u16x16();
-    transmute(simd_select::<m16x16, _>(simd_lt(a, b), a, b))
+pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u16x16();
+        let b = b.as_u16x16();
+        transmute(simd_select::<m16x16, _>(simd_lt(a, b), a, b))
+    }
 }
 
 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
@@ -2018,10 +2096,12 @@ pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpminud))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_u32x8();
-    let b = b.as_u32x8();
-    transmute(simd_select::<m32x8, _>(simd_lt(a, b), a, b))
+pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u32x8();
+        let b = b.as_u32x8();
+        transmute(simd_select::<m32x8, _>(simd_lt(a, b), a, b))
+    }
 }
 
 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
@@ -2032,10 +2112,12 @@ pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpminub))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_u8x32();
-    let b = b.as_u8x32();
-    transmute(simd_select::<m8x32, _>(simd_lt(a, b), a, b))
+pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u8x32();
+        let b = b.as_u8x32();
+        transmute(simd_select::<m8x32, _>(simd_lt(a, b), a, b))
+    }
 }
 
 /// Creates mask from the most significant bit of each 8-bit element in `a`,
@@ -2046,10 +2128,12 @@ pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovmskb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
-    let z = i8x32::ZERO;
-    let m: i8x32 = simd_lt(a.as_i8x32(), z);
-    simd_bitmask::<_, u32>(m) as i32
+pub fn _mm256_movemask_epi8(a: __m256i) -> i32 {
+    unsafe {
+        let z = i8x32::ZERO;
+        let m: i8x32 = simd_lt(a.as_i8x32(), z);
+        simd_bitmask::<_, u32>(m) as i32
+    }
 }
 
 /// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
@@ -2066,9 +2150,9 @@ pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
 #[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+pub fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
     static_assert_uimm_bits!(IMM8, 8);
-    transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8))
+    unsafe { transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8)) }
 }
 
 /// Multiplies the low 32-bit integers from each packed 64-bit element in
@@ -2081,10 +2165,12 @@ pub unsafe fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmuldq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
-    let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
-    let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
-    transmute(simd_mul(a, b))
+pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
+        let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
+        transmute(simd_mul(a, b))
+    }
 }
 
 /// Multiplies the low unsigned 32-bit integers from each packed 64-bit
@@ -2097,11 +2183,13 @@ pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmuludq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_u64x4();
-    let b = b.as_u64x4();
-    let mask = u64x4::splat(u32::MAX.into());
-    transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
+pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u64x4();
+        let b = b.as_u64x4();
+        let mask = u64x4::splat(u32::MAX.into());
+        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
+    }
 }
 
 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
@@ -2113,11 +2201,13 @@ pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmulhw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
-    let a = simd_cast::<_, i32x16>(a.as_i16x16());
-    let b = simd_cast::<_, i32x16>(b.as_i16x16());
-    let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
-    transmute(simd_cast::<i32x16, i16x16>(r))
+pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = simd_cast::<_, i32x16>(a.as_i16x16());
+        let b = simd_cast::<_, i32x16>(b.as_i16x16());
+        let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
+        transmute(simd_cast::<i32x16, i16x16>(r))
+    }
 }
 
 /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
@@ -2129,11 +2219,13 @@ pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmulhuw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
-    let a = simd_cast::<_, u32x16>(a.as_u16x16());
-    let b = simd_cast::<_, u32x16>(b.as_u16x16());
-    let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
-    transmute(simd_cast::<u32x16, u16x16>(r))
+pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = simd_cast::<_, u32x16>(a.as_u16x16());
+        let b = simd_cast::<_, u32x16>(b.as_u16x16());
+        let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
+        transmute(simd_cast::<u32x16, u16x16>(r))
+    }
 }
 
 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
@@ -2145,8 +2237,8 @@ pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmullw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(simd_mul(a.as_i16x16(), b.as_i16x16()))
+pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Multiplies the packed 32-bit integers in `a` and `b`, producing
@@ -2158,8 +2250,8 @@ pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmulld))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
-    transmute(simd_mul(a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) }
 }
 
 /// Multiplies packed 16-bit integers in `a` and `b`, producing
@@ -2172,8 +2264,8 @@ pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmulhrsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
since = "1.27.0")] -pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) +pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) } } /// Computes the bitwise OR of 256 bits (representing integer data) in `a` @@ -2184,8 +2276,8 @@ pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vorps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_or(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) } } /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers @@ -2196,8 +2288,8 @@ pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpacksswb))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(packsswb(a.as_i16x16(), b.as_i16x16())) +pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) } } /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers @@ -2208,8 +2300,8 @@ pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackssdw))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(packssdw(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) } } /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers @@ -2220,8 +2312,8 @@ pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackuswb))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(packuswb(a.as_i16x16(), b.as_i16x16())) +pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) } } /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers @@ -2232,8 +2324,8 @@ pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackusdw))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(packusdw(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) } } /// Permutes packed 32-bit integers from `a` according to the content of `b`. 
@@ -2246,8 +2338,8 @@ pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(permd(a.as_u32x8(), b.as_u32x8())) +pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(permd(a.as_u32x8(), b.as_u32x8())) } } /// Permutes 64-bit integers from `a` using control mask `imm8`. @@ -2258,20 +2350,22 @@ pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { +pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let zero = i64x4::ZERO; - let r: i64x4 = simd_shuffle!( - a.as_i64x4(), - zero, - [ - IMM8 as u32 & 0b11, - (IMM8 as u32 >> 2) & 0b11, - (IMM8 as u32 >> 4) & 0b11, - (IMM8 as u32 >> 6) & 0b11, - ], - ); - transmute(r) + unsafe { + let zero = i64x4::ZERO; + let r: i64x4 = simd_shuffle!( + a.as_i64x4(), + zero, + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], + ); + transmute(r) + } } /// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`. @@ -2282,9 +2376,9 @@ pub unsafe fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))] #[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m256i { +pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) + unsafe { transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) } } /// Shuffles 64-bit floating-point elements in `a` across lanes using the @@ -2296,18 +2390,20 @@ pub unsafe fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) #[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_permute4x64_pd(a: __m256d) -> __m256d { +pub fn _mm256_permute4x64_pd(a: __m256d) -> __m256d { static_assert_uimm_bits!(IMM8, 8); - simd_shuffle!( - a, - _mm256_undefined_pd(), - [ - IMM8 as u32 & 0b11, - (IMM8 as u32 >> 2) & 0b11, - (IMM8 as u32 >> 4) & 0b11, - (IMM8 as u32 >> 6) & 0b11, - ], - ) + unsafe { + simd_shuffle!( + a, + _mm256_undefined_pd(), + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], + ) + } } /// Shuffles eight 32-bit floating-point elements in `a` across lanes using @@ -2318,8 +2414,8 @@ pub unsafe fn _mm256_permute4x64_pd(a: __m256d) -> __m256d { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { - permps(a, idx.as_i32x8()) +pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { + unsafe { permps(a, idx.as_i32x8()) } } /// Computes the absolute differences of packed unsigned 8-bit integers in `a` @@ -2332,8 +2428,8 @@ pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { #[target_feature(enable = "avx2")] #[cfg_attr(test, 
assert_instr(vpsadbw))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { - transmute(psadbw(a.as_u8x32(), b.as_u8x32())) +pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(psadbw(a.as_u8x32(), b.as_u8x32())) } } /// Shuffles bytes from `a` according to the content of `b`. @@ -2370,8 +2466,8 @@ pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufb))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { - transmute(pshufb(a.as_u8x32(), b.as_u8x32())) +pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(pshufb(a.as_u8x32(), b.as_u8x32())) } } /// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in @@ -2409,23 +2505,25 @@ pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vshufps, MASK = 9))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { +pub fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { static_assert_uimm_bits!(MASK, 8); - let r: i32x8 = simd_shuffle!( - a.as_i32x8(), - a.as_i32x8(), - [ - MASK as u32 & 0b11, - (MASK as u32 >> 2) & 0b11, - (MASK as u32 >> 4) & 0b11, - (MASK as u32 >> 6) & 0b11, - (MASK as u32 & 0b11) + 4, - ((MASK as u32 >> 2) & 0b11) + 4, - ((MASK as u32 >> 4) & 0b11) + 4, - ((MASK as u32 >> 6) & 0b11) + 4, - ], - ); - transmute(r) + unsafe { + let r: i32x8 = simd_shuffle!( + a.as_i32x8(), + a.as_i32x8(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + (MASK as u32 >> 4) & 0b11, + (MASK as u32 >> 6) & 0b11, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 4, + ((MASK as u32 >> 6) & 0b11) + 4, + ], + ); + transmute(r) + } } /// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using @@ -2438,32 +2536,34 @@ pub unsafe fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_shufflehi_epi16(a: __m256i) -> __m256i { +pub fn _mm256_shufflehi_epi16(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i16x16(); - let r: i16x16 = simd_shuffle!( - a, - a, - [ - 0, - 1, - 2, - 3, - 4 + (IMM8 as u32 & 0b11), - 4 + ((IMM8 as u32 >> 2) & 0b11), - 4 + ((IMM8 as u32 >> 4) & 0b11), - 4 + ((IMM8 as u32 >> 6) & 0b11), - 8, - 9, - 10, - 11, - 12 + (IMM8 as u32 & 0b11), - 12 + ((IMM8 as u32 >> 2) & 0b11), - 12 + ((IMM8 as u32 >> 4) & 0b11), - 12 + ((IMM8 as u32 >> 6) & 0b11), - ], - ); - transmute(r) + unsafe { + let a = a.as_i16x16(); + let r: i16x16 = simd_shuffle!( + a, + a, + [ + 0, + 1, + 2, + 3, + 4 + (IMM8 as u32 & 0b11), + 4 + ((IMM8 as u32 >> 2) & 0b11), + 4 + ((IMM8 as u32 >> 4) & 0b11), + 4 + ((IMM8 as u32 >> 6) & 0b11), + 8, + 9, + 10, + 11, + 12 + (IMM8 as u32 & 0b11), + 12 + ((IMM8 as u32 >> 2) & 0b11), + 12 + ((IMM8 as u32 >> 4) & 0b11), + 12 + ((IMM8 as u32 >> 6) & 0b11), + ], + ); + transmute(r) + } } /// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using @@ -2476,32 +2576,34 @@ pub unsafe fn _mm256_shufflehi_epi16(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = 
"1.27.0")] -pub unsafe fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i { +pub fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i16x16(); - let r: i16x16 = simd_shuffle!( - a, - a, - [ - 0 + (IMM8 as u32 & 0b11), - 0 + ((IMM8 as u32 >> 2) & 0b11), - 0 + ((IMM8 as u32 >> 4) & 0b11), - 0 + ((IMM8 as u32 >> 6) & 0b11), - 4, - 5, - 6, - 7, - 8 + (IMM8 as u32 & 0b11), - 8 + ((IMM8 as u32 >> 2) & 0b11), - 8 + ((IMM8 as u32 >> 4) & 0b11), - 8 + ((IMM8 as u32 >> 6) & 0b11), - 12, - 13, - 14, - 15, - ], - ); - transmute(r) + unsafe { + let a = a.as_i16x16(); + let r: i16x16 = simd_shuffle!( + a, + a, + [ + 0 + (IMM8 as u32 & 0b11), + 0 + ((IMM8 as u32 >> 2) & 0b11), + 0 + ((IMM8 as u32 >> 4) & 0b11), + 0 + ((IMM8 as u32 >> 6) & 0b11), + 4, + 5, + 6, + 7, + 8 + (IMM8 as u32 & 0b11), + 8 + ((IMM8 as u32 >> 2) & 0b11), + 8 + ((IMM8 as u32 >> 4) & 0b11), + 8 + ((IMM8 as u32 >> 6) & 0b11), + 12, + 13, + 14, + 15, + ], + ); + transmute(r) + } } /// Negates packed 16-bit integers in `a` when the corresponding signed @@ -2513,8 +2615,8 @@ pub unsafe fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignw))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(psignw(a.as_i16x16(), b.as_i16x16())) +pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(psignw(a.as_i16x16(), b.as_i16x16())) } } /// Negates packed 32-bit integers in `a` when the corresponding signed @@ -2526,8 +2628,8 @@ pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(psignd(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(psignd(a.as_i32x8(), b.as_i32x8())) } } /// Negates packed 8-bit integers in `a` when the corresponding signed @@ -2539,8 +2641,8 @@ pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignb))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { - transmute(psignb(a.as_i8x32(), b.as_i8x32())) +pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(psignb(a.as_i8x32(), b.as_i8x32())) } } /// Shifts packed 16-bit integers in `a` left by `count` while @@ -2551,8 +2653,8 @@ pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllw))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { - transmute(psllw(a.as_i16x16(), count.as_i16x8())) +pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(psllw(a.as_i16x16(), count.as_i16x8())) } } /// Shifts packed 32-bit integers in `a` left by `count` while @@ -2563,8 +2665,8 @@ pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslld))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { - transmute(pslld(a.as_i32x8(), count.as_i32x4())) +pub fn _mm256_sll_epi32(a: 
__m256i, count: __m128i) -> __m256i { + unsafe { transmute(pslld(a.as_i32x8(), count.as_i32x4())) } } /// Shifts packed 64-bit integers in `a` left by `count` while @@ -2575,8 +2677,8 @@ pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllq))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { - transmute(psllq(a.as_i64x4(), count.as_i64x2())) +pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(psllq(a.as_i64x4(), count.as_i64x2())) } } /// Shifts packed 16-bit integers in `a` left by `IMM8` while @@ -2588,12 +2690,14 @@ pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { #[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_slli_epi16(a: __m256i) -> __m256i { +pub fn _mm256_slli_epi16(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 16 { - _mm256_setzero_si256() - } else { - transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + unsafe { + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + } } } @@ -2606,12 +2710,14 @@ pub unsafe fn _mm256_slli_epi16(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_slli_epi32(a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm256_setzero_si256() - } else { - transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32))) +pub fn _mm256_slli_epi32(a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + } } } @@ -2624,12 +2730,14 @@ pub unsafe fn _mm256_slli_epi32(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_slli_epi64(a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm256_setzero_si256() - } else { - transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64))) +pub fn _mm256_slli_epi64(a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + } } } @@ -2641,7 +2749,7 @@ pub unsafe fn _mm256_slli_epi64(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_slli_si256(a: __m256i) -> __m256i { +pub fn _mm256_slli_si256(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); _mm256_bslli_epi128::(a) } @@ -2654,7 +2762,7 @@ pub unsafe fn _mm256_slli_si256(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_bslli_epi128(a: __m256i) -> __m256i { +pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); const fn mask(shift: i32, i: u32) -> u32 { let shift = shift as u32 & 0xff; @@ -2664,46 +2772,48 @@ pub unsafe fn _mm256_bslli_epi128(a: __m256i) -> __m256i { 32 + (i - 
shift) } } - let a = a.as_i8x32(); - let r: i8x32 = simd_shuffle!( - i8x32::ZERO, - a, - [ - mask(IMM8, 0), - mask(IMM8, 1), - mask(IMM8, 2), - mask(IMM8, 3), - mask(IMM8, 4), - mask(IMM8, 5), - mask(IMM8, 6), - mask(IMM8, 7), - mask(IMM8, 8), - mask(IMM8, 9), - mask(IMM8, 10), - mask(IMM8, 11), - mask(IMM8, 12), - mask(IMM8, 13), - mask(IMM8, 14), - mask(IMM8, 15), - mask(IMM8, 16), - mask(IMM8, 17), - mask(IMM8, 18), - mask(IMM8, 19), - mask(IMM8, 20), - mask(IMM8, 21), - mask(IMM8, 22), - mask(IMM8, 23), - mask(IMM8, 24), - mask(IMM8, 25), - mask(IMM8, 26), - mask(IMM8, 27), - mask(IMM8, 28), - mask(IMM8, 29), - mask(IMM8, 30), - mask(IMM8, 31), - ], - ); - transmute(r) + unsafe { + let a = a.as_i8x32(); + let r: i8x32 = simd_shuffle!( + i8x32::ZERO, + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + ], + ); + transmute(r) + } } /// Shifts packed 32-bit integers in `a` left by the amount @@ -2715,8 +2825,8 @@ pub unsafe fn _mm256_bslli_epi128(a: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { - transmute(psllvd(a.as_i32x4(), count.as_i32x4())) +pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(psllvd(a.as_i32x4(), count.as_i32x4())) } } /// Shifts packed 32-bit integers in `a` left by the amount @@ -2728,8 +2838,8 @@ pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { - transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) +pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) } } /// Shifts packed 64-bit integers in `a` left by the amount @@ -2741,8 +2851,8 @@ pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvq))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { - transmute(psllvq(a.as_i64x2(), count.as_i64x2())) +pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(psllvq(a.as_i64x2(), count.as_i64x2())) } } /// Shifts packed 64-bit integers in `a` left by the amount @@ -2754,8 +2864,8 @@ pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvq))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { - transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) +pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) } } /// Shifts packed 16-bit integers in `a` right 
by `count` while @@ -2766,8 +2876,8 @@ pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsraw))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { - transmute(psraw(a.as_i16x16(), count.as_i16x8())) +pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(psraw(a.as_i16x16(), count.as_i16x8())) } } /// Shifts packed 32-bit integers in `a` right by `count` while @@ -2778,8 +2888,8 @@ pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrad))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { - transmute(psrad(a.as_i32x8(), count.as_i32x4())) +pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(psrad(a.as_i32x8(), count.as_i32x4())) } } /// Shifts packed 16-bit integers in `a` right by `IMM8` while @@ -2791,9 +2901,9 @@ pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { #[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_srai_epi16(a: __m256i) -> __m256i { +pub fn _mm256_srai_epi16(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) + unsafe { transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) } } /// Shifts packed 32-bit integers in `a` right by `IMM8` while @@ -2805,9 +2915,9 @@ pub unsafe fn _mm256_srai_epi16(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_srai_epi32(a: __m256i) -> __m256i { +pub fn _mm256_srai_epi32(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) + unsafe { transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) } } /// Shifts packed 32-bit integers in `a` right by the amount specified by the @@ -2818,8 +2928,8 @@ pub unsafe fn _mm256_srai_epi32(a: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsravd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { - transmute(psravd(a.as_i32x4(), count.as_i32x4())) +pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(psravd(a.as_i32x4(), count.as_i32x4())) } } /// Shifts packed 32-bit integers in `a` right by the amount specified by the @@ -2830,8 +2940,8 @@ pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsravd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { - transmute(psravd256(a.as_i32x8(), count.as_i32x8())) +pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(psravd256(a.as_i32x8(), count.as_i32x8())) } } /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. 
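Before the byte-wise shifts below, a hedged sketch (illustrative names, not from this diff) of two behaviours the code above encodes: the immediate arithmetic shifts clamp the count at `width - 1` (the `IMM8.min(15)` / `IMM8.min(31)` calls), which turns an oversized count into a cheap per-element sign mask, and the variable-count `vpsravd` form takes an independent count per element, filling with the sign bit once the count reaches 32:

    use std::arch::x86_64::*;

    /// All-ones where the element is negative, all-zeros otherwise: the
    /// clamped count broadcasts the sign bit across each 32-bit element.
    #[target_feature(enable = "avx2")]
    unsafe fn sign_mask_epi32(a: __m256i) -> __m256i {
        _mm256_srai_epi32::<31>(a)
    }

    /// Per-element arithmetic shift: each element of `a` is shifted by the
    /// matching count; the oversized count 40 sign-fills its element.
    #[target_feature(enable = "avx2")]
    unsafe fn per_element_sra(a: __m256i) -> __m256i {
        let counts = _mm256_setr_epi32(0, 1, 2, 3, 8, 16, 31, 40);
        _mm256_srav_epi32(a, counts)
    }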
@@ -2842,7 +2952,7 @@ pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_srli_si256(a: __m256i) -> __m256i { +pub fn _mm256_srli_si256(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); _mm256_bsrli_epi128::(a) } @@ -2855,142 +2965,144 @@ pub unsafe fn _mm256_srli_si256(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { +pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i8x32(); - let zero = i8x32::ZERO; - let r: i8x32 = match IMM8 % 16 { - 0 => simd_shuffle!( - a, - zero, - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, - 23, 24, 25, 26, 27, 28, 29, 30, 31, - ], - ), - 1 => simd_shuffle!( - a, - zero, - [ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, - ], - ), - 2 => simd_shuffle!( - a, - zero, - [ - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, 32, - ], - ), - 3 => simd_shuffle!( - a, - zero, - [ - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, 32, 32, - ], - ), - 4 => simd_shuffle!( - a, - zero, - [ - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 20, 21, 22, 23, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 32, 32, 32, - ], - ), - 5 => simd_shuffle!( - a, - zero, - [ - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 21, 22, 23, 24, 25, 26, - 27, 28, 29, 30, 31, 32, 32, 32, 32, 32, - ], - ), - 6 => simd_shuffle!( - a, - zero, - [ - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 22, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, - ], - ), - 7 => simd_shuffle!( - a, - zero, - [ - 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 8 => simd_shuffle!( - a, - zero, - [ - 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 24, 25, 26, 27, 28, - 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 9 => simd_shuffle!( - a, - zero, - [ - 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 25, 26, 27, 28, 29, - 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 10 => simd_shuffle!( - a, - zero, - [ - 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 26, 27, 28, 29, 30, - 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 11 => simd_shuffle!( - a, - zero, - [ - 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 27, 28, 29, 30, 31, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 12 => simd_shuffle!( - a, - zero, - [ - 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 28, 29, 30, 31, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 13 => simd_shuffle!( - a, - zero, - [ - 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 29, 30, 31, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 14 => simd_shuffle!( - a, - zero, - [ - 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 30, 31, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - 15 => simd_shuffle!( - a, - zero, - 
[ - 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - ], - ), - _ => zero, - }; - transmute(r) + unsafe { + let a = a.as_i8x32(); + let zero = i8x32::ZERO; + let r: i8x32 = match IMM8 % 16 { + 0 => simd_shuffle!( + a, + zero, + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ], + ), + 1 => simd_shuffle!( + a, + zero, + [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ], + ), + 2 => simd_shuffle!( + a, + zero, + [ + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 32, + ], + ), + 3 => simd_shuffle!( + a, + zero, + [ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 32, 32, + ], + ), + 4 => simd_shuffle!( + a, + zero, + [ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 32, 32, 32, + ], + ), + 5 => simd_shuffle!( + a, + zero, + [ + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 32, 32, 32, 32, + ], + ), + 6 => simd_shuffle!( + a, + zero, + [ + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, + ], + ), + 7 => simd_shuffle!( + a, + zero, + [ + 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 8 => simd_shuffle!( + a, + zero, + [ + 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 9 => simd_shuffle!( + a, + zero, + [ + 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 25, 26, 27, 28, + 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 10 => simd_shuffle!( + a, + zero, + [ + 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 26, 27, 28, 29, + 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 11 => simd_shuffle!( + a, + zero, + [ + 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 27, 28, 29, 30, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 12 => simd_shuffle!( + a, + zero, + [ + 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 28, 29, 30, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 13 => simd_shuffle!( + a, + zero, + [ + 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 29, 30, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 14 => simd_shuffle!( + a, + zero, + [ + 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 30, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + 15 => simd_shuffle!( + a, + zero, + [ + 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + ], + ), + _ => zero, + }; + transmute(r) + } } /// Shifts packed 16-bit integers in `a` right by `count` while shifting in @@ -3001,8 +3113,8 @@ pub unsafe fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlw))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { - transmute(psrlw(a.as_i16x16(), 
count.as_i16x8())) +pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(psrlw(a.as_i16x16(), count.as_i16x8())) } } /// Shifts packed 32-bit integers in `a` right by `count` while shifting in @@ -3013,8 +3125,8 @@ pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrld))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { - transmute(psrld(a.as_i32x8(), count.as_i32x4())) +pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(psrld(a.as_i32x8(), count.as_i32x4())) } } /// Shifts packed 64-bit integers in `a` right by `count` while shifting in @@ -3025,8 +3137,8 @@ pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlq))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { - transmute(psrlq(a.as_i64x4(), count.as_i64x2())) +pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(psrlq(a.as_i64x4(), count.as_i64x2())) } } /// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in @@ -3038,12 +3150,14 @@ pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { #[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_srli_epi16(a: __m256i) -> __m256i { +pub fn _mm256_srli_epi16(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 16 { - _mm256_setzero_si256() - } else { - transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + unsafe { + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + } } } @@ -3056,12 +3170,14 @@ pub unsafe fn _mm256_srli_epi16(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_srli_epi32(a: __m256i) -> __m256i { +pub fn _mm256_srli_epi32(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm256_setzero_si256() - } else { - transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + unsafe { + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + } } } @@ -3074,12 +3190,14 @@ pub unsafe fn _mm256_srli_epi32(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_srli_epi64(a: __m256i) -> __m256i { +pub fn _mm256_srli_epi64(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm256_setzero_si256() - } else { - transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + unsafe { + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + } } } @@ -3091,8 +3209,8 @@ pub unsafe fn _mm256_srli_epi64(a: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { - transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) +pub fn _mm_srlv_epi32(a: __m128i, count: 
__m128i) -> __m128i { + unsafe { transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) } } /// Shifts packed 32-bit integers in `a` right by the amount specified by @@ -3103,8 +3221,8 @@ pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { - transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) +pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) } } /// Shifts packed 64-bit integers in `a` right by the amount specified by @@ -3115,8 +3233,8 @@ pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvq))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { - transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) +pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) } } /// Shifts packed 64-bit integers in `a` right by the amount specified by @@ -3127,8 +3245,8 @@ pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvq))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { - transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) +pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) } } /// Load 256-bits of integer data from memory into dst using a non-temporal memory hint. 
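For symmetry, a sketch of the logical-shift counterparts just converted (again illustrative, not part of the patch): unlike the arithmetic shifts, `vpsllvd`/`vpsrlvd` zero an element whenever its count reaches the element width, and the 128-bit byte shifts above never move data across the lane boundary:

    use std::arch::x86_64::*;

    /// Per-element left shift; the two oversized counts zero their elements
    /// rather than being an error as an oversized scalar `<<` would be.
    #[target_feature(enable = "avx2")]
    unsafe fn per_element_shl(a: __m256i) -> __m256i {
        let counts = _mm256_setr_epi32(0, 1, 2, 3, 4, 31, 32, 100);
        _mm256_sllv_epi32(a, counts)
    }

    /// Byte shift right: each 16-byte lane is shifted independently and its
    /// top four bytes are zero-filled; bytes never cross the lane boundary.
    #[target_feature(enable = "avx2")]
    unsafe fn bsrli_demo(a: __m256i) -> __m256i {
        _mm256_bsrli_epi128::<4>(a)
    }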
mem_addr @@ -3158,8 +3276,8 @@ pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubw))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) +pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) } } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a` @@ -3169,8 +3287,8 @@ pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) } } /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a` @@ -3180,8 +3298,8 @@ pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubq))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) +pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) } } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` @@ -3191,8 +3309,8 @@ pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubb))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) +pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) } } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in @@ -3203,8 +3321,8 @@ pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubsw))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) +pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) } } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in @@ -3215,8 +3333,8 @@ pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubsb))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) +pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) } } /// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit @@ -3227,8 +3345,8 @@ pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubusw))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> 
__m256i { - transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) +pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) } } /// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit @@ -3239,8 +3357,8 @@ pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubusb))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) +pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) } } /// Unpacks and interleave 8-bit integers from the high half of each @@ -3286,15 +3404,17 @@ pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhbw))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { - #[rustfmt::skip] - let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [ - 8, 40, 9, 41, 10, 42, 11, 43, - 12, 44, 13, 45, 14, 46, 15, 47, - 24, 56, 25, 57, 26, 58, 27, 59, - 28, 60, 29, 61, 30, 62, 31, 63, - ]); - transmute(r) +pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [ + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47, + 24, 56, 25, 57, 26, 58, 27, 59, + 28, 60, 29, 61, 30, 62, 31, 63, + ]); + transmute(r) + } } /// Unpacks and interleave 8-bit integers from the low half of each @@ -3339,15 +3459,17 @@ pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklbw))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { - #[rustfmt::skip] - let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [ - 0, 32, 1, 33, 2, 34, 3, 35, - 4, 36, 5, 37, 6, 38, 7, 39, - 16, 48, 17, 49, 18, 50, 19, 51, - 20, 52, 21, 53, 22, 54, 23, 55, - ]); - transmute(r) +pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 16, 48, 17, 49, 18, 50, 19, 51, + 20, 52, 21, 53, 22, 54, 23, 55, + ]); + transmute(r) + } } /// Unpacks and interleave 16-bit integers from the high half of each @@ -3388,13 +3510,15 @@ pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhwd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { - let r: i16x16 = simd_shuffle!( - a.as_i16x16(), - b.as_i16x16(), - [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], - ); - transmute(r) +pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r: i16x16 = simd_shuffle!( + a.as_i16x16(), + b.as_i16x16(), + [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], + ); + transmute(r) + } } /// Unpacks and interleave 16-bit integers from the low half of each @@ -3436,13 +3560,15 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] 
#[cfg_attr(test, assert_instr(vpunpcklwd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { - let r: i16x16 = simd_shuffle!( - a.as_i16x16(), - b.as_i16x16(), - [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], - ); - transmute(r) +pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r: i16x16 = simd_shuffle!( + a.as_i16x16(), + b.as_i16x16(), + [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], + ); + transmute(r) + } } /// Unpacks and interleave 32-bit integers from the high half of each @@ -3477,9 +3603,11 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vunpckhps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { - let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]); - transmute(r) +pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]); + transmute(r) + } } /// Unpacks and interleave 32-bit integers from the low half of each @@ -3514,9 +3642,11 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vunpcklps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { - let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]); - transmute(r) +pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]); + transmute(r) + } } /// Unpacks and interleave 64-bit integers from the high half of each @@ -3551,9 +3681,11 @@ pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vunpckhpd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { - let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]); - transmute(r) +pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]); + transmute(r) + } } /// Unpacks and interleave 64-bit integers from the low half of each @@ -3588,9 +3720,11 @@ pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vunpcklpd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { - let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]); - transmute(r) +pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]); + transmute(r) + } } /// Computes the bitwise XOR of 256 bits (representing integer data) @@ -3601,8 +3735,8 @@ pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vxorps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) +pub fn 
_mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) } } /// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit @@ -3616,9 +3750,9 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { // This intrinsic has no corresponding instruction. #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_extract_epi8(a: __m256i) -> i32 { +pub fn _mm256_extract_epi8(a: __m256i) -> i32 { static_assert_uimm_bits!(INDEX, 5); - simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 + unsafe { simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 } } /// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit @@ -3632,9 +3766,9 @@ pub unsafe fn _mm256_extract_epi8(a: __m256i) -> i32 { // This intrinsic has no corresponding instruction. #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_extract_epi16(a: __m256i) -> i32 { +pub fn _mm256_extract_epi16(a: __m256i) -> i32 { static_assert_uimm_bits!(INDEX, 4); - simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 + unsafe { simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 } } #[allow(improper_ctypes)]
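To close, an end-to-end sketch of how a caller drives these intrinsics (function names and constants are illustrative, not from this patch): CPU support is established once at runtime, after which an `unsafe` call into an AVX2-marked helper is sound; the zero-interleave helper also demonstrates the per-lane semantics of the unpack intrinsics above:

    use std::arch::x86_64::*;
    use std::arch::is_x86_feature_detected;

    /// XOR two vectors and extract byte 2 of the result; INDEX is a const
    /// generic, range-checked against the lane count at compile time.
    #[target_feature(enable = "avx2")]
    unsafe fn third_byte(a: __m256i, b: __m256i) -> i32 {
        _mm256_extract_epi8::<2>(_mm256_xor_si256(a, b))
    }

    /// Zero-extend bytes to 16-bit words by interleaving with zero; note
    /// that vpunpcklbw/vpunpckhbw take the low/high half of each 128-bit
    /// lane, not of the whole 256-bit vector.
    #[target_feature(enable = "avx2")]
    unsafe fn zero_extend_u8_to_u16(a: __m256i) -> (__m256i, __m256i) {
        let z = _mm256_setzero_si256();
        (_mm256_unpacklo_epi8(a, z), _mm256_unpackhi_epi8(a, z))
    }

    fn main() {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 support was verified on the line above.
            let r = unsafe { third_byte(_mm256_set1_epi8(0x0f), _mm256_set1_epi8(0x11)) };
            assert_eq!(r, 0x1e);
        }
    }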