From 1dcba9edde29458d12d35a915a2ad9d36bfcf33a Mon Sep 17 00:00:00 2001
From: Alan Somers
Date: Sun, 7 May 2023 10:52:00 -0600
Subject: [PATCH] Implement _mm256_i32scatter_epi64 from AVX512VL

---
 library/stdarch/crates/core_arch/avx512f.md |  2 +-
 .../crates/core_arch/src/x86/avx512f.rs     | 22 +++++++++++++++++++
 .../crates/core_arch/src/x86_64/avx512f.rs  | 14 ++++++++++++
 3 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/library/stdarch/crates/core_arch/avx512f.md b/library/stdarch/crates/core_arch/avx512f.md
index 6cb6e6564071..2435583bfa66 100644
--- a/library/stdarch/crates/core_arch/avx512f.md
+++ b/library/stdarch/crates/core_arch/avx512f.md
@@ -1519,7 +1519,7 @@
   * [x] [`_mm512_mask_i32scatter_epi64`]
   * [_] [`_mm_i32scatter_epi64`]//need i1
   * [_] [`_mm_mask_i32scatter_epi64`] //need i1
-  * [_] [`_mm256_i32scatter_epi64`] //need i1
+  * [x] [`_mm256_i32scatter_epi64`]
   * [_] [`_mm256_mask_i32scatter_epi64`] //need i1
   * [x] [`_mm512_i32scatter_ps`]
   * [x] [`_mm512_mask_i32scatter_ps`]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
index 9b38a9f35266..9baa7eeca040 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
@@ -15757,6 +15757,26 @@ pub unsafe fn _mm512_mask_i32scatter_epi64<const SCALE: i32>(
     vpscatterdq(slice, mask, offsets, src, SCALE);
 }
 
+/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32scatter_epi64&expand=4099)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_i32scatter_epi64<const SCALE: i32>(
+    slice: *mut u8,
+    offsets: __m128i,
+    src: __m256i,
+) {
+    static_assert_imm8_scale!(SCALE);
+    let src = src.as_i64x4();
+    let neg_one = -1; // all-ones mask: this unmasked variant masks off no lane
+    let slice = slice as *mut i8;
+    let offsets = offsets.as_i32x4();
+    vpscatterdq256(slice, neg_one, offsets, src, SCALE);
+}
+
 /// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_epi64&expand=3116)
@@ -38307,6 +38327,8 @@ extern "C" {
     fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32);
     #[link_name = "llvm.x86.avx512.scatter.dpq.512"]
     fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32);
+    #[link_name = "llvm.x86.avx512.scattersiv4.di"]
+    fn vpscatterdq256(slice: *mut i8, mask: i8, offsets: i32x4, src: i64x4, scale: i32);
 
     #[link_name = "llvm.x86.avx512.scatter.dpi.512"]
     fn vpscatterdd(slice: *mut i8, mask: i16, offsets: i32x16, src: i32x16, scale: i32);
 
diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs
index d414effa7ba1..68f3327677ee 100644
--- a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs
+++ b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs
@@ -7551,6 +7551,20 @@ mod tests {
         assert_eq!(&arr[..], &expected[..],);
     }
 
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm256_i32scatter_epi64() {
+        let mut arr = [0i64; 64];
+        let index = _mm_setr_epi32(0, 16, 32, 48);
+        let src = _mm256_setr_epi64x(1, 2, 3, 4);
+        // A multiplier of 8 is word-addressing
+        _mm256_i32scatter_epi64::<8>(arr.as_mut_ptr() as *mut u8, index, src);
+        let mut expected = [0i64; 64];
+        for i in 0..4 {
+            expected[i * 16] = (i + 1) as i64;
+        }
+        assert_eq!(&arr[..], &expected[..],);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_i64scatter_epi64() {
         let mut arr = [0i64; 128];