From efedfe9fdd2f3c6a690d8026d4131a06ddb744db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Mi=C4=85sko?= Date: Fri, 5 Mar 2021 00:00:00 +0000 Subject: [PATCH] Convert _mm{256,_mask,}_permute_p{d,s} to const generics * _mm256_permute_pd * _mm256_permute_ps * _mm_mask_permute_pd * _mm_maskz_permute_pd * _mm_permute_pd * _mm_permute_ps --- .../stdarch/crates/core_arch/src/x86/avx.rs | 219 +++++------------- .../crates/core_arch/src/x86/avx512f.rs | 48 ++-- .../crates/core_arch/src/x86_64/avx512f.rs | 8 +- 3 files changed, 85 insertions(+), 190 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs index 8f040fc2f54a..860133e66cb1 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx.rs @@ -1087,57 +1087,25 @@ pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute_ps) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] -#[rustc_args_required_const(1)] +#[cfg_attr(test, assert_instr(vpermilps, IMM8 = 9))] +#[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_permute_ps(a: __m256, imm8: i32) -> __m256 { - let imm8 = (imm8 & 0xFF) as u8; - let undefined = _mm256_undefined_ps(); - macro_rules! shuffle4 { - ($a:expr, $b:expr, $c:expr, $d:expr) => { - simd_shuffle8( - a, - undefined, - [$a, $b, $c, $d, $a + 4, $b + 4, $c + 4, $d + 4], - ) - }; - } - macro_rules! shuffle3 { - ($a:expr, $b:expr, $c:expr) => { - match (imm8 >> 6) & 0b11 { - 0b00 => shuffle4!($a, $b, $c, 0), - 0b01 => shuffle4!($a, $b, $c, 1), - 0b10 => shuffle4!($a, $b, $c, 2), - _ => shuffle4!($a, $b, $c, 3), - } - }; - } - macro_rules! 
shuffle2 { - ($a:expr, $b:expr) => { - match (imm8 >> 4) & 0b11 { - 0b00 => shuffle3!($a, $b, 0), - 0b01 => shuffle3!($a, $b, 1), - 0b10 => shuffle3!($a, $b, 2), - _ => shuffle3!($a, $b, 3), - } - }; - } - macro_rules! shuffle1 { - ($a:expr) => { - match (imm8 >> 2) & 0b11 { - 0b00 => shuffle2!($a, 0), - 0b01 => shuffle2!($a, 1), - 0b10 => shuffle2!($a, 2), - _ => shuffle2!($a, 3), - } - }; - } - match imm8 & 0b11 { - 0b00 => shuffle1!(0), - 0b01 => shuffle1!(1), - 0b10 => shuffle1!(2), - _ => shuffle1!(3), - } +pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 { + static_assert_imm8!(IMM8); + simd_shuffle8( + a, + _mm256_undefined_ps(), + [ + (IMM8 as u32 >> 0) & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ((IMM8 as u32 >> 0) & 0b11) + 4, + ((IMM8 as u32 >> 2) & 0b11) + 4, + ((IMM8 as u32 >> 4) & 0b11) + 4, + ((IMM8 as u32 >> 6) & 0b11) + 4, + ], + ) } /// Shuffles single-precision (32-bit) floating-point elements in `a` @@ -1146,53 +1114,21 @@ pub unsafe fn _mm256_permute_ps(a: __m256, imm8: i32) -> __m256 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permute_ps) #[inline] #[target_feature(enable = "avx,sse")] -#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] -#[rustc_args_required_const(1)] +#[cfg_attr(test, assert_instr(vpermilps, IMM8 = 9))] +#[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_permute_ps(a: __m128, imm8: i32) -> __m128 { - let imm8 = (imm8 & 0xFF) as u8; - let undefined = _mm_undefined_ps(); - macro_rules! shuffle4 { - ($a:expr, $b:expr, $c:expr, $d:expr) => { - simd_shuffle4(a, undefined, [$a, $b, $c, $d]) - }; - } - macro_rules! shuffle3 { - ($a:expr, $b:expr, $c:expr) => { - match (imm8 >> 6) & 0b11 { - 0b00 => shuffle4!($a, $b, $c, 0), - 0b01 => shuffle4!($a, $b, $c, 1), - 0b10 => shuffle4!($a, $b, $c, 2), - _ => shuffle4!($a, $b, $c, 3), - } - }; - } - macro_rules! 
shuffle2 { - ($a:expr, $b:expr) => { - match (imm8 >> 4) & 0b11 { - 0b00 => shuffle3!($a, $b, 0), - 0b01 => shuffle3!($a, $b, 1), - 0b10 => shuffle3!($a, $b, 2), - _ => shuffle3!($a, $b, 3), - } - }; - } - macro_rules! shuffle1 { - ($a:expr) => { - match (imm8 >> 2) & 0b11 { - 0b00 => shuffle2!($a, 0), - 0b01 => shuffle2!($a, 1), - 0b10 => shuffle2!($a, 2), - _ => shuffle2!($a, 3), - } - }; - } - match imm8 & 0b11 { - 0b00 => shuffle1!(0), - 0b01 => shuffle1!(1), - 0b10 => shuffle1!(2), - _ => shuffle1!(3), - } +pub unsafe fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 { + static_assert_imm8!(IMM8); + simd_shuffle4( + a, + _mm_undefined_ps(), + [ + (IMM8 as u32 >> 0) & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], + ) } /// Shuffles double-precision (64-bit) floating-point elements in `a` @@ -1225,45 +1161,21 @@ pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute_pd) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] -#[rustc_args_required_const(1)] +#[cfg_attr(test, assert_instr(vpermilpd, IMM4 = 0x1))] +#[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_permute_pd(a: __m256d, imm8: i32) -> __m256d { - let imm8 = (imm8 & 0xFF) as u8; - let undefined = _mm256_undefined_pd(); - macro_rules! shuffle4 { - ($a:expr, $b:expr, $c:expr, $d:expr) => { - simd_shuffle4(a, undefined, [$a, $b, $c, $d]) - }; - } - macro_rules! shuffle3 { - ($a:expr, $b:expr, $c:expr) => { - match (imm8 >> 3) & 0x1 { - 0 => shuffle4!($a, $b, $c, 2), - _ => shuffle4!($a, $b, $c, 3), - } - }; - } - macro_rules! shuffle2 { - ($a:expr, $b:expr) => { - match (imm8 >> 2) & 0x1 { - 0 => shuffle3!($a, $b, 2), - _ => shuffle3!($a, $b, 3), - } - }; - } - macro_rules! 
shuffle1 { - ($a:expr) => { - match (imm8 >> 1) & 0x1 { - 0 => shuffle2!($a, 0), - _ => shuffle2!($a, 1), - } - }; - } - match imm8 & 0x1 { - 0 => shuffle1!(0), - _ => shuffle1!(1), - } +pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d { + static_assert_imm4!(IMM4); + simd_shuffle4( + a, + _mm256_undefined_pd(), + [ + ((IMM4 as u32 >> 0) & 1), + ((IMM4 as u32 >> 1) & 1), + ((IMM4 as u32 >> 2) & 1) + 2, + ((IMM4 as u32 >> 3) & 1) + 2, + ], + ) } /// Shuffles double-precision (64-bit) floating-point elements in `a` @@ -1272,29 +1184,16 @@ pub unsafe fn _mm256_permute_pd(a: __m256d, imm8: i32) -> __m256d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permute_pd) #[inline] #[target_feature(enable = "avx,sse2")] -#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] -#[rustc_args_required_const(1)] +#[cfg_attr(test, assert_instr(vpermilpd, IMM2 = 0x1))] +#[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d { - let imm8 = (imm8 & 0xFF) as u8; - let undefined = _mm_undefined_pd(); - macro_rules! shuffle2 { - ($a:expr, $b:expr) => { - simd_shuffle2(a, undefined, [$a, $b]) - }; - } - macro_rules! 
shuffle1 { - ($a:expr) => { - match (imm8 >> 1) & 0x1 { - 0 => shuffle2!($a, 0), - _ => shuffle2!($a, 1), - } - }; - } - match imm8 & 0x1 { - 0 => shuffle1!(0), - _ => shuffle1!(1), - } +pub unsafe fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d { + static_assert_imm2!(IMM2); + simd_shuffle2( + a, + _mm_undefined_pd(), + [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1], + ) } /// Shuffles 256 bits (composed of 8 packed single-precision (32-bit) @@ -3784,7 +3683,7 @@ mod tests { #[simd_test(enable = "avx")] unsafe fn test_mm256_permute_ps() { let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); - let r = _mm256_permute_ps(a, 0x1b); + let r = _mm256_permute_ps::<0x1b>(a); let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.); assert_eq_m256(r, e); } @@ -3792,7 +3691,7 @@ mod tests { #[simd_test(enable = "avx")] unsafe fn test_mm_permute_ps() { let a = _mm_setr_ps(4., 3., 2., 5.); - let r = _mm_permute_ps(a, 0x1b); + let r = _mm_permute_ps::<0x1b>(a); let e = _mm_setr_ps(5., 2., 3., 4.); assert_eq_m128(r, e); } @@ -3818,7 +3717,7 @@ mod tests { #[simd_test(enable = "avx")] unsafe fn test_mm256_permute_pd() { let a = _mm256_setr_pd(4., 3., 2., 5.); - let r = _mm256_permute_pd(a, 5); + let r = _mm256_permute_pd::<5>(a); let e = _mm256_setr_pd(3., 4., 5., 2.); assert_eq_m256d(r, e); } @@ -3826,7 +3725,7 @@ mod tests { #[simd_test(enable = "avx")] unsafe fn test_mm_permute_pd() { let a = _mm_setr_pd(4., 3.); - let r = _mm_permute_pd(a, 1); + let r = _mm_permute_pd::<1>(a); let e = _mm_setr_pd(3., 4.); assert_eq_m128d(r, e); } diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs index 7bf8bdeae907..13e18d77ce68 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs @@ -19904,7 +19904,7 @@ pub unsafe fn _mm512_maskz_permute_ps(k: __mmask16, a: __m512, imm8: i32) -> __m pub unsafe fn _mm256_mask_permute_ps(src: __m256, k: __mmask8, a: __m256, 
imm8: i32) -> __m256 { macro_rules! call { ($imm8:expr) => { - _mm256_permute_ps(a, $imm8) + _mm256_permute_ps::<$imm8>(a) }; } let r = constify_imm8_sae!(imm8, call); @@ -19921,7 +19921,7 @@ pub unsafe fn _mm256_mask_permute_ps(src: __m256, k: __mmask8, a: __m256, imm8: pub unsafe fn _mm256_maskz_permute_ps(k: __mmask8, a: __m256, imm8: i32) -> __m256 { macro_rules! call { ($imm8:expr) => { - _mm256_permute_ps(a, $imm8) + _mm256_permute_ps::<$imm8>(a) }; } let r = constify_imm8_sae!(imm8, call); @@ -19939,7 +19939,7 @@ pub unsafe fn _mm256_maskz_permute_ps(k: __mmask8, a: __m256, imm8: i32) -> __m2 pub unsafe fn _mm_mask_permute_ps(src: __m128, k: __mmask8, a: __m128, imm8: i32) -> __m128 { macro_rules! call { ($imm8:expr) => { - _mm_permute_ps(a, $imm8) + _mm_permute_ps::<$imm8>(a) }; } let r = constify_imm8_sae!(imm8, call); @@ -19956,7 +19956,7 @@ pub unsafe fn _mm_mask_permute_ps(src: __m128, k: __mmask8, a: __m128, imm8: i32 pub unsafe fn _mm_maskz_permute_ps(k: __mmask8, a: __m128, imm8: i32) -> __m128 { macro_rules! call { ($imm8:expr) => { - _mm_permute_ps(a, $imm8) + _mm_permute_ps::<$imm8>(a) }; } let r = constify_imm8_sae!(imm8, call); @@ -20058,10 +20058,10 @@ pub unsafe fn _mm512_maskz_permute_pd(k: __mmask8, a: __m512d, imm8: i32) -> __m pub unsafe fn _mm256_mask_permute_pd(src: __m256d, k: __mmask8, a: __m256d, imm8: i32) -> __m256d { macro_rules! call { ($imm8:expr) => { - _mm256_permute_pd(a, $imm8) + _mm256_permute_pd::<$imm8>(a) }; } - let r = constify_imm8_sae!(imm8, call); + let r = constify_imm4!(imm8, call); transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) } @@ -20075,10 +20075,10 @@ pub unsafe fn _mm256_mask_permute_pd(src: __m256d, k: __mmask8, a: __m256d, imm8 pub unsafe fn _mm256_maskz_permute_pd(k: __mmask8, a: __m256d, imm8: i32) -> __m256d { macro_rules! 
call { ($imm8:expr) => { - _mm256_permute_pd(a, $imm8) + _mm256_permute_pd::<$imm8>(a) }; } - let r = constify_imm8_sae!(imm8, call); + let r = constify_imm4!(imm8, call); let zero = _mm256_setzero_pd().as_f64x4(); transmute(simd_select_bitmask(k, r.as_f64x4(), zero)) } @@ -20088,15 +20088,15 @@ pub unsafe fn _mm256_maskz_permute_pd(k: __mmask8, a: __m256d, imm8: i32) -> __m /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permute_pd&expand=4153) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0b01))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_permute_pd(src: __m128d, k: __mmask8, a: __m128d, imm8: i32) -> __m128d { - macro_rules! call { - ($imm8:expr) => { - _mm_permute_pd(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpermilpd, IMM2 = 0b01))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_permute_pd<const IMM2: i32>( + src: __m128d, + k: __mmask8, + a: __m128d, +) -> __m128d { + static_assert_imm2!(IMM2); + let r = _mm_permute_pd::<IMM2>(a); transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2())) } @@ -20105,15 +20105,11 @@ pub unsafe fn _mm_mask_permute_pd(src: __m128d, k: __mmask8, a: __m128d, imm8: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permute_pd&expand=4154) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0b01))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_permute_pd(k: __mmask8, a: __m128d, imm8: i32) -> __m128d { - macro_rules! 
call { - ($imm8:expr) => { - _mm_permute_pd(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpermilpd, IMM2 = 0b01))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_permute_pd<const IMM2: i32>(k: __mmask8, a: __m128d) -> __m128d { + static_assert_imm2!(IMM2); + let r = _mm_permute_pd::<IMM2>(a); let zero = _mm_setzero_pd().as_f64x2(); transmute(simd_select_bitmask(k, r.as_f64x2(), zero)) } diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs index caaf3e6d737a..3f14f5f901d5 100644 --- a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs @@ -9006,9 +9006,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_permute_pd() { let a = _mm_set_pd(1., 0.); - let r = _mm_mask_permute_pd(a, 0, a, 0b1111); + let r = _mm_mask_permute_pd::<0b11>(a, 0, a); assert_eq_m128d(r, a); - let r = _mm_mask_permute_pd(a, 0b00000011, a, 0b1111); + let r = _mm_mask_permute_pd::<0b11>(a, 0b00000011, a); let e = _mm_set_pd(1., 1.); assert_eq_m128d(r, e); } #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_maskz_permute_pd() { let a = _mm_set_pd(1., 0.); - let r = _mm_maskz_permute_pd(0, a, 0b1111); + let r = _mm_maskz_permute_pd::<0b11>(0, a); assert_eq_m128d(r, _mm_setzero_pd()); - let r = _mm_maskz_permute_pd(0b00000011, a, 0b1111); + let r = _mm_maskz_permute_pd::<0b11>(0b00000011, a); let e = _mm_set_pd(1., 1.); assert_eq_m128d(r, e); }