From 9af0ddafacba6ec96bd8b5ae630751817f8dd3ed Mon Sep 17 00:00:00 2001 From: sayantn Date: Mon, 10 Jun 2024 12:16:15 +0530 Subject: [PATCH] AVX512DQ Part 2: Broadcast, Extract, Insert --- library/stdarch/crates/core_arch/avx512dq.md | 140 +- .../crates/core_arch/src/x86/avx512dq.rs | 1781 ++++++++++++++++- 2 files changed, 1825 insertions(+), 96 deletions(-) diff --git a/library/stdarch/crates/core_arch/avx512dq.md b/library/stdarch/crates/core_arch/avx512dq.md index 82b66445a7c1..197dd8021fb8 100644 --- a/library/stdarch/crates/core_arch/avx512dq.md +++ b/library/stdarch/crates/core_arch/avx512dq.md @@ -1,5 +1,7 @@ ["AVX512DQ"]

+[Intel's List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512DQ) + - And: * [x] _mm_mask_and_pd * [x] _mm_maskz_and_pd @@ -69,39 +71,39 @@ - Broadcast - * [ ] _mm256_broadcast_f32x2 - * [ ] _mm256_mask_broadcast_f32x2 - * [ ] _mm256_maskz_broadcast_f32x2 - * [ ] _mm512_broadcast_f32x2 - * [ ] _mm512_mask_broadcast_f32x2 - * [ ] _mm512_maskz_broadcast_f32x2 - * [ ] _mm512_broadcast_f32x8 - * [ ] _mm512_mask_broadcast_f32x8 - * [ ] _mm512_maskz_broadcast_f32x8 - * [ ] _mm256_broadcast_f64x2 - * [ ] _mm256_mask_broadcast_f64x2 - * [ ] _mm256_maskz_broadcast_f64x2 - * [ ] _mm512_broadcast_f64x2 - * [ ] _mm512_mask_broadcast_f64x2 - * [ ] _mm512_maskz_broadcast_f64x2 - * [ ] _mm_broadcast_i32x2 - * [ ] _mm_mask_broadcast_i32x2 - * [ ] _mm_maskz_broadcast_i32x2 - * [ ] _mm256_broadcast_i32x2 - * [ ] _mm256_mask_broadcast_i32x2 - * [ ] _mm256_maskz_broadcast_i32x2 - * [ ] _mm512_broadcast_i32x2 - * [ ] _mm512_mask_broadcast_i32x2 - * [ ] _mm512_maskz_broadcast_i32x2 - * [ ] _mm512_broadcast_i32x8 - * [ ] _mm512_mask_broadcast_i32x8 - * [ ] _mm512_maskz_broadcast_i32x8 - * [ ] _mm256_broadcast_i64x2 - * [ ] _mm256_mask_broadcast_i64x2 - * [ ] _mm256_maskz_broadcast_i64x2 - * [ ] _mm512_broadcast_i64x2 - * [ ] _mm512_mask_broadcast_i64x2 - * [ ] _mm512_maskz_broadcast_i64x2 + * [x] _mm256_broadcast_f32x2 + * [x] _mm256_mask_broadcast_f32x2 + * [x] _mm256_maskz_broadcast_f32x2 + * [x] _mm512_broadcast_f32x2 + * [x] _mm512_mask_broadcast_f32x2 + * [x] _mm512_maskz_broadcast_f32x2 + * [x] _mm512_broadcast_f32x8 + * [x] _mm512_mask_broadcast_f32x8 + * [x] _mm512_maskz_broadcast_f32x8 + * [x] _mm256_broadcast_f64x2 + * [x] _mm256_mask_broadcast_f64x2 + * [x] _mm256_maskz_broadcast_f64x2 + * [x] _mm512_broadcast_f64x2 + * [x] _mm512_mask_broadcast_f64x2 + * [x] _mm512_maskz_broadcast_f64x2 + * [x] _mm_broadcast_i32x2 + * [x] _mm_mask_broadcast_i32x2 + * [x] _mm_maskz_broadcast_i32x2 + * [x] _mm256_broadcast_i32x2 + * [x] 
_mm256_mask_broadcast_i32x2 + * [x] _mm256_maskz_broadcast_i32x2 + * [x] _mm512_broadcast_i32x2 + * [x] _mm512_mask_broadcast_i32x2 + * [x] _mm512_maskz_broadcast_i32x2 + * [x] _mm512_broadcast_i32x8 + * [x] _mm512_mask_broadcast_i32x8 + * [x] _mm512_maskz_broadcast_i32x8 + * [x] _mm256_broadcast_i64x2 + * [x] _mm256_mask_broadcast_i64x2 + * [x] _mm256_maskz_broadcast_i64x2 + * [x] _mm512_broadcast_i64x2 + * [x] _mm512_mask_broadcast_i64x2 + * [x] _mm512_maskz_broadcast_i64x2 - Convert: @@ -252,45 +254,45 @@ - Element Extract: - * [ ] _mm512_extractf32x8_ps - * [ ] _mm512_mask_extractf32x8_ps - * [ ] _mm512_maskz_extractf32x8_ps - * [ ] _mm256_extractf64x2_pd - * [ ] _mm256_mask_extractf64x2_pd - * [ ] _mm256_maskz_extractf64x2_pd - * [ ] _mm512_extractf64x2_pd - * [ ] _mm512_mask_extractf64x2_pd - * [ ] _mm512_maskz_extractf64x2_pd - * [ ] _mm512_extracti32x8_epi32 - * [ ] _mm512_mask_extracti32x8_epi32 - * [ ] _mm512_maskz_extracti32x8_epi32 - * [ ] _mm256_extracti64x2_epi64 - * [ ] _mm256_mask_extracti64x2_epi64 - * [ ] _mm256_maskz_extracti64x2_epi64 - * [ ] _mm512_extracti64x2_epi64 - * [ ] _mm512_mask_extracti64x2_epi64 - * [ ] _mm512_maskz_extracti64x2_epi64 + * [x] _mm512_extractf32x8_ps + * [x] _mm512_mask_extractf32x8_ps + * [x] _mm512_maskz_extractf32x8_ps + * [x] _mm256_extractf64x2_pd + * [x] _mm256_mask_extractf64x2_pd + * [x] _mm256_maskz_extractf64x2_pd + * [x] _mm512_extractf64x2_pd + * [x] _mm512_mask_extractf64x2_pd + * [x] _mm512_maskz_extractf64x2_pd + * [x] _mm512_extracti32x8_epi32 + * [x] _mm512_mask_extracti32x8_epi32 + * [x] _mm512_maskz_extracti32x8_epi32 + * [x] _mm256_extracti64x2_epi64 + * [x] _mm256_mask_extracti64x2_epi64 + * [x] _mm256_maskz_extracti64x2_epi64 + * [x] _mm512_extracti64x2_epi64 + * [x] _mm512_mask_extracti64x2_epi64 + * [x] _mm512_maskz_extracti64x2_epi64 - Element Insert: - * [ ] _mm512_insertf32x8 - * [ ] _mm512_mask_insertf32x8 - * [ ] _mm512_maskz_insertf32x8 - * [ ] _mm256_insertf64x2 - * [ ] 
_mm256_mask_insertf64x2 - * [ ] _mm256_maskz_insertf64x2 - * [ ] _mm512_insertf64x2 - * [ ] _mm512_mask_insertf64x2 - * [ ] _mm512_maskz_insertf64x2 - * [ ] _mm512_inserti32x8 - * [ ] _mm512_mask_inserti32x8 - * [ ] _mm512_maskz_inserti32x8 - * [ ] _mm256_inserti64x2 - * [ ] _mm256_mask_inserti64x2 - * [ ] _mm256_maskz_inserti64x2 - * [ ] _mm512_inserti64x2 - * [ ] _mm512_mask_inserti64x2 - * [ ] _mm512_maskz_inserti64x2 + * [x] _mm512_insertf32x8 + * [x] _mm512_mask_insertf32x8 + * [x] _mm512_maskz_insertf32x8 + * [x] _mm256_insertf64x2 + * [x] _mm256_mask_insertf64x2 + * [x] _mm256_maskz_insertf64x2 + * [x] _mm512_insertf64x2 + * [x] _mm512_mask_insertf64x2 + * [x] _mm512_maskz_insertf64x2 + * [x] _mm512_inserti32x8 + * [x] _mm512_mask_inserti32x8 + * [x] _mm512_maskz_inserti32x8 + * [x] _mm256_inserti64x2 + * [x] _mm256_mask_inserti64x2 + * [x] _mm256_maskz_inserti64x2 + * [x] _mm512_inserti64x2 + * [x] _mm512_mask_inserti64x2 + * [x] _mm512_maskz_inserti64x2 - FP-Class diff --git a/library/stdarch/crates/core_arch/src/x86/avx512dq.rs b/library/stdarch/crates/core_arch/src/x86/avx512dq.rs index c4ff97035cfa..a5fb77791d88 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512dq.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512dq.rs @@ -32,7 +32,6 @@ pub unsafe fn _mm_maskz_and_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { transmute(simd_select_bitmask(k, and, zero)) } - /// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b /// and store the results in dst using writemask k (elements are copied from src if the corresponding /// bit is not set). @@ -350,7 +349,7 @@ pub unsafe fn _mm512_andnot_ps(a: __m512, b: __m512) -> __m512 { /// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then /// bitwise AND with b and store the results in dst using writemask k (elements are copied from src if the /// corresponding bit is not set). 
-/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_andnot_ps&ig_expand=341) #[inline] #[target_feature(enable = "avx512dq")] @@ -363,7 +362,7 @@ pub unsafe fn _mm512_mask_andnot_ps(src: __m512, k: __mmask16, a: __m512, b: __m /// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then /// bitwise AND with b and store the results in dst using zeromask k (elements are zeroed out if the /// corresponding bit is not set). -/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_andnot_ps&ig_expand=342) #[inline] #[target_feature(enable = "avx512dq")] @@ -740,6 +739,1071 @@ pub unsafe fn _mm512_maskz_xor_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 transmute(simd_select_bitmask(k, xor, zero)) } +// Broadcast + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_f32x2&ig_expand=509) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcastf32x2))] +pub unsafe fn _mm256_broadcast_f32x2(a: __m128) -> __m256 { + let b: f32x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_f32x2&ig_expand=510) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcastf32x2))] +pub unsafe fn _mm256_mask_broadcast_f32x2(src: __m256, k: __mmask8, a: __m128) -> __m256 { + let b = _mm256_broadcast_f32x2(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, src.as_f32x8())) +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_f32x2&ig_expand=511) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcastf32x2))] +pub unsafe fn _mm256_maskz_broadcast_f32x2(k: __mmask8, a: __m128) -> __m256 { + let b = _mm256_broadcast_f32x2(a).as_f32x8(); + let zero = _mm256_setzero_ps().as_f32x8(); + transmute(simd_select_bitmask(k, b, zero)) +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f32x2&ig_expand=512) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vbroadcastf32x2))] +pub unsafe fn _mm512_broadcast_f32x2(a: __m128) -> __m512 { + let b: f32x16 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f32x2&ig_expand=513) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vbroadcastf32x2))] +pub unsafe fn _mm512_mask_broadcast_f32x2(src: __m512, k: __mmask16, a: __m128) -> __m512 { + let b = _mm512_broadcast_f32x2(a).as_f32x16(); + transmute(simd_select_bitmask(k, b, src.as_f32x16())) +} + +/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f32x2&ig_expand=514) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vbroadcastf32x2))] +pub unsafe fn _mm512_maskz_broadcast_f32x2(k: __mmask16, a: __m128) -> __m512 { + let b = _mm512_broadcast_f32x2(a).as_f32x16(); + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, b, zero)) +} + +/// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f32x8&ig_expand=521) +#[inline] +#[target_feature(enable = "avx512dq")] +pub unsafe fn _mm512_broadcast_f32x8(a: __m256) -> __m512 { + let b: f32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]); + transmute(b) +} + +/// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f32x8&ig_expand=522) +#[inline] +#[target_feature(enable = "avx512dq")] +pub unsafe fn _mm512_mask_broadcast_f32x8(src: __m512, k: __mmask16, a: __m256) -> __m512 { + let b = _mm512_broadcast_f32x8(a).as_f32x16(); + transmute(simd_select_bitmask(k, b, src.as_f32x16())) +} + +/// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f32x8&ig_expand=523) +#[inline] +#[target_feature(enable = "avx512dq")] +pub unsafe fn _mm512_maskz_broadcast_f32x8(k: __mmask16, a: __m256) -> __m512 { + let b = _mm512_broadcast_f32x8(a).as_f32x16(); + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, b, zero)) +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_f64x2&ig_expand=524) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +pub unsafe fn _mm256_broadcast_f64x2(a: __m128d) -> __m256d { + let b: f64x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); + transmute(b) +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_f64x2&ig_expand=525) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +pub unsafe fn _mm256_mask_broadcast_f64x2(src: __m256d, k: __mmask8, a: __m128d) -> __m256d { + let b = _mm256_broadcast_f64x2(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, src.as_f64x4())) +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_f64x2&ig_expand=526) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +pub unsafe fn _mm256_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m256d { + let b = _mm256_broadcast_f64x2(a).as_f64x4(); + let zero = _mm256_setzero_pd().as_f64x4(); + transmute(simd_select_bitmask(k, b, zero)) +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_f64x2&ig_expand=527) +#[inline] +#[target_feature(enable = "avx512dq")] +pub unsafe fn _mm512_broadcast_f64x2(a: __m128d) -> __m512d { + let b: f64x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst using writemask k (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_f64x2&ig_expand=528) +#[inline] +#[target_feature(enable = "avx512dq")] +pub unsafe fn _mm512_mask_broadcast_f64x2(src: __m512d, k: __mmask8, a: __m128d) -> __m512d { + let b = _mm512_broadcast_f64x2(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) +} + +/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all +/// elements of dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_f64x2&ig_expand=529) +#[inline] +#[target_feature(enable = "avx512dq")] +pub unsafe fn _mm512_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m512d { + let b = _mm512_broadcast_f64x2(a).as_f64x8(); + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, b, zero)) +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_i32x2&ig_expand=533) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +pub unsafe fn _mm_broadcast_i32x2(a: __m128i) -> __m128i { + let a = a.as_i32x4(); + let b: i32x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); + transmute(b) +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_broadcast_i32x2&ig_expand=534) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +pub unsafe fn _mm_mask_broadcast_i32x2(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + let b = _mm_broadcast_i32x2(a).as_i32x4(); + transmute(simd_select_bitmask(k, b, src.as_i32x4())) +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_broadcast_i32x2&ig_expand=535) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +pub unsafe fn _mm_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m128i { + let b = _mm_broadcast_i32x2(a).as_i32x4(); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, b, zero)) +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_i32x2&ig_expand=536) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +pub unsafe fn _mm256_broadcast_i32x2(a: __m128i) -> __m256i { + let a = a.as_i32x4(); + let b: i32x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_i32x2&ig_expand=537) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +pub unsafe fn _mm256_mask_broadcast_i32x2(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + let b = _mm256_broadcast_i32x2(a).as_i32x8(); + transmute(simd_select_bitmask(k, b, src.as_i32x8())) +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_i32x2&ig_expand=538) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +pub unsafe fn _mm256_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m256i { + let b = _mm256_broadcast_i32x2(a).as_i32x8(); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, b, zero)) +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i32x2&ig_expand=539) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +pub unsafe fn _mm512_broadcast_i32x2(a: __m128i) -> __m512i { + let a = a.as_i32x4(); + let b: i32x16 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i32x2&ig_expand=540) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +pub unsafe fn _mm512_mask_broadcast_i32x2(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + let b = _mm512_broadcast_i32x2(a).as_i32x16(); + transmute(simd_select_bitmask(k, b, src.as_i32x16())) +} + +/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i32x2&ig_expand=541) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vbroadcasti32x2))] +pub unsafe fn _mm512_maskz_broadcast_i32x2(k: __mmask16, a: __m128i) -> __m512i { + let b = _mm512_broadcast_i32x2(a).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, b, zero)) +} + +/// Broadcasts the 8 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i32x8&ig_expand=548) +#[inline] +#[target_feature(enable = "avx512dq")] +pub unsafe fn _mm512_broadcast_i32x8(a: __m256i) -> __m512i { + let a = a.as_i32x8(); + let b: i32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]); + transmute(b) +} + +/// Broadcasts the 8 packed 32-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i32x8&ig_expand=549) +#[inline] +#[target_feature(enable = "avx512dq")] +pub unsafe fn _mm512_mask_broadcast_i32x8(src: __m512i, k: __mmask16, a: __m256i) -> __m512i { + let b = _mm512_broadcast_i32x8(a).as_i32x16(); + transmute(simd_select_bitmask(k, b, src.as_i32x16())) +} + +/// Broadcasts the 8 packed 32-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i32x8&ig_expand=550) +#[inline] +#[target_feature(enable = "avx512dq")] +pub unsafe fn _mm512_maskz_broadcast_i32x8(k: __mmask16, a: __m256i) -> __m512i { + let b = _mm512_broadcast_i32x8(a).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, b, zero)) +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_i64x2&ig_expand=551) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +pub unsafe fn _mm256_broadcast_i64x2(a: __m128i) -> __m256i { + let a = a.as_i64x2(); + let b: i64x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); + transmute(b) +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_broadcast_i64x2&ig_expand=552) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +pub unsafe fn _mm256_mask_broadcast_i64x2(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + let b = _mm256_broadcast_i64x2(a).as_i64x4(); + transmute(simd_select_bitmask(k, b, src.as_i64x4())) +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_broadcast_i64x2&ig_expand=553) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +pub unsafe fn _mm256_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m256i { + let b = _mm256_broadcast_i64x2(a).as_i64x4(); + let zero = _mm256_setzero_si256().as_i64x4(); + transmute(simd_select_bitmask(k, b, zero)) +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_broadcast_i64x2&ig_expand=554) +#[inline] +#[target_feature(enable = "avx512dq")] +pub unsafe fn _mm512_broadcast_i64x2(a: __m128i) -> __m512i { + let a = a.as_i64x2(); + let b: i64x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_broadcast_i64x2&ig_expand=555) +#[inline] +#[target_feature(enable = "avx512dq")] +pub unsafe fn _mm512_mask_broadcast_i64x2(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { + let b = _mm512_broadcast_i64x2(a).as_i64x8(); + transmute(simd_select_bitmask(k, b, src.as_i64x8())) +} + +/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_broadcast_i64x2&ig_expand=556) +#[inline] +#[target_feature(enable = "avx512dq")] +pub unsafe fn _mm512_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m512i { + let b = _mm512_broadcast_i64x2(a).as_i64x8(); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, b, zero)) +} + +// Extract + +/// Extracts 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extractf32x8_ps&ig_expand=2946) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextractf32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_extractf32x8_ps(a: __m512) -> __m256 { + static_assert_uimm_bits!(IMM8, 1); + match IMM8 & 1 { + 0 => simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + _ => simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + } +} + +/// Extracts 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst using writemask k (elements are copied from src +/// if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extractf32x8_ps&ig_expand=2947) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextractf32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_extractf32x8_ps( + src: __m256, + k: __mmask8, + a: __m512, +) -> __m256 { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_extractf32x8_ps::(a); + transmute(simd_select_bitmask(k, b.as_f32x8(), src.as_f32x8())) +} + +/// Extracts 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extractf32x8_ps&ig_expand=2948) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextractf32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_extractf32x8_ps(k: __mmask8, a: __m512) -> __m256 { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_extractf32x8_ps::(a); + let zero = _mm256_setzero_ps().as_f32x8(); + transmute(simd_select_bitmask(k, b.as_f32x8(), zero)) +} + +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf64x2_pd&ig_expand=2949) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm256_extractf64x2_pd(a: __m256d) -> __m128d { + static_assert_uimm_bits!(IMM8, 1); + match IMM8 & 1 { + 0 => simd_shuffle!(a, a, [0, 1]), + _ => simd_shuffle!(a, a, [2, 3]), + } +} + +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst using writemask k (elements are copied from src +/// if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_extractf64x2_pd&ig_expand=2950) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_extractf64x2_pd( + src: __m128d, + k: __mmask8, + a: __m256d, +) -> __m128d { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_extractf64x2_pd::(a); + transmute(simd_select_bitmask(k, b.as_f64x2(), src.as_f64x2())) +} + +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_extractf64x2_pd&ig_expand=2951) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_extractf64x2_pd(k: __mmask8, a: __m256d) -> __m128d { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_extractf64x2_pd::(a); + let zero = _mm_setzero_pd().as_f64x2(); + transmute(simd_select_bitmask(k, b.as_f64x2(), zero)) +} + +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extractf64x2_pd&ig_expand=2952) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_extractf64x2_pd(a: __m512d) -> __m128d { + static_assert_uimm_bits!(IMM8, 2); + match IMM8 & 3 { + 0 => simd_shuffle!(a, a, [0, 1]), + 1 => simd_shuffle!(a, a, [2, 3]), + 2 => simd_shuffle!(a, a, [4, 5]), + _ => simd_shuffle!(a, a, [6, 7]), + } +} + +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst using writemask k (elements are copied from src +/// if the corresponding bit is not set). 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extractf64x2_pd&ig_expand=2953) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_extractf64x2_pd<const IMM8: i32>( + src: __m128d, + k: __mmask8, + a: __m512d, +) -> __m128d { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_extractf64x2_pd::<IMM8>(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, src.as_f64x2())) +} + +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, +/// selected with IMM8, and stores the result in dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extractf64x2_pd&ig_expand=2954) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_extractf64x2_pd<const IMM8: i32>(k: __mmask8, a: __m512d) -> __m128d { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_extractf64x2_pd::<IMM8>(a).as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + transmute(simd_select_bitmask(k, b, zero)) +} + +/// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores +/// the result in dst.
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extracti32x8_epi32&ig_expand=2965) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextracti32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_extracti32x8_epi32<const IMM8: i32>(a: __m512i) -> __m256i { + static_assert_uimm_bits!(IMM8, 1); + let a = a.as_i32x16(); + let b: i32x8 = match IMM8 & 1 { + 0 => simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + _ => simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + }; + transmute(b) +} + +/// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores +/// the result in dst using writemask k (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extracti32x8_epi32&ig_expand=2966) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextracti32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_extracti32x8_epi32<const IMM8: i32>( + src: __m256i, + k: __mmask8, + a: __m512i, +) -> __m256i { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_extracti32x8_epi32::<IMM8>(a).as_i32x8(); + transmute(simd_select_bitmask(k, b, src.as_i32x8())) +} + +/// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores +/// the result in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extracti32x8_epi32&ig_expand=2967) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextracti32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_extracti32x8_epi32<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m256i { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_extracti32x8_epi32::<IMM8>(a).as_i32x8(); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, b, zero)) +} + +/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores +/// the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti64x2_epi64&ig_expand=2968) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm256_extracti64x2_epi64<const IMM8: i32>(a: __m256i) -> __m128i { + static_assert_uimm_bits!(IMM8, 1); + match IMM8 & 1 { + 0 => simd_shuffle!(a, a, [0, 1]), + _ => simd_shuffle!(a, a, [2, 3]), + } +} + +/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores +/// the result in dst using writemask k (elements are copied from src if the corresponding bit is not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_extracti64x2_epi64&ig_expand=2969) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_extracti64x2_epi64<const IMM8: i32>( + src: __m128i, + k: __mmask8, + a: __m256i, +) -> __m128i { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_extracti64x2_epi64::<IMM8>(a).as_i64x2(); + transmute(simd_select_bitmask(k, b, src.as_i64x2())) +} + +/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores +/// the result in dst using zeromask k (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_extracti64x2_epi64&ig_expand=2970) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_extracti64x2_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m128i { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_extracti64x2_epi64::<IMM8>(a).as_i64x2(); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, b, zero)) +} + +/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores +/// the result in dst.
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_extracti64x2_epi64&ig_expand=2971) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_extracti64x2_epi64<const IMM8: i32>(a: __m512i) -> __m128i { + static_assert_uimm_bits!(IMM8, 2); + match IMM8 & 3 { + 0 => simd_shuffle!(a, a, [0, 1]), + 1 => simd_shuffle!(a, a, [2, 3]), + 2 => simd_shuffle!(a, a, [4, 5]), + _ => simd_shuffle!(a, a, [6, 7]), + } +} + +/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores +/// the result in dst using writemask k (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_extracti64x2_epi64&ig_expand=2972) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_extracti64x2_epi64<const IMM8: i32>( + src: __m128i, + k: __mmask8, + a: __m512i, +) -> __m128i { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_extracti64x2_epi64::<IMM8>(a).as_i64x2(); + transmute(simd_select_bitmask(k, b, src.as_i64x2())) +} + +/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores +/// the result in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_extracti64x2_epi64&ig_expand=2973) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_extracti64x2_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m128i { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_extracti64x2_epi64::<IMM8>(a).as_i64x2(); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, b, zero)) +} + +// Insert + +/// Copy a to dst, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point +/// elements) from b into dst at the location specified by IMM8. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_insertf32x8&ig_expand=3850) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinsertf32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_insertf32x8<const IMM8: i32>(a: __m512, b: __m256) -> __m512 { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_castps256_ps512(b); + match IMM8 & 1 { + 0 => simd_shuffle!( + a, + b, + [16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15] + ), + _ => simd_shuffle!( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] + ), + } +} + +/// Copy a to tmp, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point +/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using writemask k +/// (elements are copied from src if the corresponding bit is not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_insertf32x8&ig_expand=3851) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinsertf32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_insertf32x8<const IMM8: i32>( + src: __m512, + k: __mmask16, + a: __m512, + b: __m256, +) -> __m512 { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm512_insertf32x8::<IMM8>(a, b); + transmute(simd_select_bitmask(k, c.as_f32x16(), src.as_f32x16())) +} + +/// Copy a to tmp, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point +/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_insertf32x8&ig_expand=3852) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinsertf32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_insertf32x8<const IMM8: i32>( + k: __mmask16, + a: __m512, + b: __m256, +) -> __m512 { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm512_insertf32x8::<IMM8>(a, b).as_f32x16(); + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, c, zero)) +} + +/// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point +/// elements) from b into dst at the location specified by IMM8.
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf64x2&ig_expand=3853) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_insertf64x2<const IMM8: i32>(a: __m256d, b: __m128d) -> __m256d { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_castpd128_pd256(b); + match IMM8 & 1 { + 0 => simd_shuffle!(a, b, [4, 5, 2, 3]), + _ => simd_shuffle!(a, b, [0, 1, 4, 5]), + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point +/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using writemask k +/// (elements are copied from src if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_insertf64x2&ig_expand=3854) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_insertf64x2<const IMM8: i32>( + src: __m256d, + k: __mmask8, + a: __m256d, + b: __m128d, +) -> __m256d { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm256_insertf64x2::<IMM8>(a, b); + transmute(simd_select_bitmask(k, c.as_f64x4(), src.as_f64x4())) +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point +/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_insertf64x2&ig_expand=3855) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_insertf64x2<const IMM8: i32>( + k: __mmask8, + a: __m256d, + b: __m128d, +) -> __m256d { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm256_insertf64x2::<IMM8>(a, b).as_f64x4(); + let zero = _mm256_setzero_pd().as_f64x4(); + transmute(simd_select_bitmask(k, c, zero)) +} + +/// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point +/// elements) from b into dst at the location specified by IMM8. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_insertf64x2&ig_expand=3856) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_insertf64x2<const IMM8: i32>(a: __m512d, b: __m128d) -> __m512d { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_castpd128_pd512(b); + match IMM8 & 3 { + 0 => simd_shuffle!(a, b, [8, 9, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 1, 8, 9, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 6, 7]), + _ => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8, 9]), + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point +/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using writemask k +/// (elements are copied from src if the corresponding bit is not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_insertf64x2&ig_expand=3857) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_insertf64x2<const IMM8: i32>( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m128d, +) -> __m512d { + static_assert_uimm_bits!(IMM8, 2); + let c = _mm512_insertf64x2::<IMM8>(a, b); + transmute(simd_select_bitmask(k, c.as_f64x8(), src.as_f64x8())) +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point +/// elements) from b into tmp at the location specified by IMM8, and copy tmp to dst using zeromask k +/// (elements are zeroed out if the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_insertf64x2&ig_expand=3858) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_insertf64x2<const IMM8: i32>( + k: __mmask8, + a: __m512d, + b: __m128d, +) -> __m512d { + static_assert_uimm_bits!(IMM8, 2); + let c = _mm512_insertf64x2::<IMM8>(a, b).as_f64x8(); + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, c, zero)) +} + +/// Copy a to dst, then insert 256 bits (composed of 8 packed 32-bit integers) from b into dst at the +/// location specified by IMM8.
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_inserti32x8&ig_expand=3869) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinserti32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_inserti32x8<const IMM8: i32>(a: __m512i, b: __m256i) -> __m512i { + static_assert_uimm_bits!(IMM8, 1); + let a = a.as_i32x16(); + let b = _mm512_castsi256_si512(b).as_i32x16(); + let r: i32x16 = match IMM8 & 1 { + 0 => simd_shuffle!( + a, + b, + [16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15] + ), + _ => simd_shuffle!( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] + ), + }; + transmute(r) +} + +/// Copy a to tmp, then insert 256 bits (composed of 8 packed 32-bit integers) from b into tmp at the +/// location specified by IMM8, and copy tmp to dst using writemask k (elements are copied from src if +/// the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_inserti32x8&ig_expand=3870) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinserti32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_inserti32x8<const IMM8: i32>( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m256i, +) -> __m512i { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm512_inserti32x8::<IMM8>(a, b); + transmute(simd_select_bitmask(k, c.as_i32x16(), src.as_i32x16())) +} + +/// Copy a to tmp, then insert 256 bits (composed of 8 packed 32-bit integers) from b into tmp at the +/// location specified by IMM8, and copy tmp to dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_inserti32x8&ig_expand=3871) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinserti32x8, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_inserti32x8<const IMM8: i32>( + k: __mmask16, + a: __m512i, + b: __m256i, +) -> __m512i { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm512_inserti32x8::<IMM8>(a, b).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, c, zero)) +} + +/// Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the +/// location specified by IMM8. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti64x2&ig_expand=3872) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_inserti64x2<const IMM8: i32>(a: __m256i, b: __m128i) -> __m256i { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_castsi128_si256(b); + match IMM8 & 1 { + 0 => simd_shuffle!(a, b, [4, 5, 2, 3]), + _ => simd_shuffle!(a, b, [0, 1, 4, 5]), + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the +/// location specified by IMM8, and copy tmp to dst using writemask k (elements are copied from src if +/// the corresponding bit is not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_inserti64x2&ig_expand=3873) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_inserti64x2<const IMM8: i32>( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m128i, +) -> __m256i { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm256_inserti64x2::<IMM8>(a, b); + transmute(simd_select_bitmask(k, c.as_i64x4(), src.as_i64x4())) +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the +/// location specified by IMM8, and copy tmp to dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_inserti64x2&ig_expand=3874) +#[inline] +#[target_feature(enable = "avx512dq,avx512vl")] +#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_inserti64x2<const IMM8: i32>( + k: __mmask8, + a: __m256i, + b: __m128i, +) -> __m256i { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm256_inserti64x2::<IMM8>(a, b).as_i64x4(); + let zero = _mm256_setzero_si256().as_i64x4(); + transmute(simd_select_bitmask(k, c, zero)) +} + +/// Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the +/// location specified by IMM8.
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_inserti64x2&ig_expand=3875) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_inserti64x2<const IMM8: i32>(a: __m512i, b: __m128i) -> __m512i { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_castsi128_si512(b); + match IMM8 & 3 { + 0 => simd_shuffle!(a, b, [8, 9, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 1, 8, 9, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 6, 7]), + _ => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8, 9]), + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the +/// location specified by IMM8, and copy tmp to dst using writemask k (elements are copied from src if +/// the corresponding bit is not set). +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_inserti64x2&ig_expand=3876) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_inserti64x2<const IMM8: i32>( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m128i, +) -> __m512i { + static_assert_uimm_bits!(IMM8, 2); + let c = _mm512_inserti64x2::<IMM8>(a, b); + transmute(simd_select_bitmask(k, c.as_i64x8(), src.as_i64x8())) +} + +/// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the +/// location specified by IMM8, and copy tmp to dst using zeromask k (elements are zeroed out if the +/// corresponding bit is not set).
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_inserti64x2&ig_expand=3877) +#[inline] +#[target_feature(enable = "avx512dq")] +#[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 3))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_inserti64x2<const IMM8: i32>( + k: __mmask8, + a: __m512i, + b: __m128i, +) -> __m512i { + static_assert_uimm_bits!(IMM8, 2); + let c = _mm512_inserti64x2::<IMM8>(a, b).as_i64x8(); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, c, zero)) +} + #[cfg(test)] mod tests { use super::*; @@ -748,22 +1812,23 @@ mod tests { use crate::core_arch::x86::*; use crate::core_arch::x86_64::*; + use crate::mem::transmute; - const OPRND1_64: f64 = f64::from_bits(0x3333333333333333); - const OPRND2_64: f64 = f64::from_bits(0x5555555555555555); + const OPRND1_64: f64 = unsafe { transmute(0x3333333333333333_u64) }; + const OPRND2_64: f64 = unsafe { transmute(0x5555555555555555_u64) }; - const AND_64: f64 = f64::from_bits(0x1111111111111111); - const ANDN_64: f64 = f64::from_bits(0x4444444444444444); - const OR_64: f64 = f64::from_bits(0x7777777777777777); - const XOR_64: f64 = f64::from_bits(0x6666666666666666); + const AND_64: f64 = unsafe { transmute(0x1111111111111111_u64) }; + const ANDN_64: f64 = unsafe { transmute(0x4444444444444444_u64) }; + const OR_64: f64 = unsafe { transmute(0x7777777777777777_u64) }; + const XOR_64: f64 = unsafe { transmute(0x6666666666666666_u64) }; - const OPRND1_32: f32 = f32::from_bits(0x33333333); - const OPRND2_32: f32 = f32::from_bits(0x55555555); + const OPRND1_32: f32 = unsafe { transmute(0x33333333_u32) }; + const OPRND2_32: f32 = unsafe { transmute(0x55555555_u32) }; - const AND_32: f32 = f32::from_bits(0x11111111); - const ANDN_32: f32 = f32::from_bits(0x44444444); - const OR_32: f32 = f32::from_bits(0x77777777); - const XOR_32: f32 = f32::from_bits(0x66666666); + const AND_32: f32 = unsafe {
transmute(0x11111111_u32) }; + const ANDN_32: f32 = unsafe { transmute(0x44444444_u32) }; + const OR_32: f32 = unsafe { transmute(0x77777777_u32) }; + const XOR_32: f32 = unsafe { transmute(0x66666666_u32) }; #[simd_test(enable = "avx512dq,avx512vl")] unsafe fn test_mm_mask_and_pd() { @@ -882,9 +1947,14 @@ mod tests { unsafe fn test_mm512_mask_and_ps() { let a = _mm512_set1_ps(OPRND1_32); let b = _mm512_set1_ps(OPRND2_32); - let src = _mm512_set_ps(1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.); + let src = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); let r = _mm512_mask_and_ps(src, 0b0101010101010101, a, b); - let e = _mm512_set_ps(1., AND_32, 3., AND_32, 5., AND_32, 7., AND_32, 9., AND_32, 11., AND_32, 13., AND_32, 15., AND_32); + let e = _mm512_set_ps( + 1., AND_32, 3., AND_32, 5., AND_32, 7., AND_32, 9., AND_32, 11., AND_32, 13., AND_32, + 15., AND_32, + ); assert_eq_m512(r, e); } @@ -893,7 +1963,10 @@ mod tests { let a = _mm512_set1_ps(OPRND1_32); let b = _mm512_set1_ps(OPRND2_32); let r = _mm512_maskz_and_ps(0b0101010101010101, a, b); - let e = _mm512_set_ps(0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32); + let e = _mm512_set_ps( + 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., AND_32, 0., + AND_32, + ); assert_eq_m512(r, e); } @@ -1014,9 +2087,14 @@ mod tests { unsafe fn test_mm512_mask_andnot_ps() { let a = _mm512_set1_ps(OPRND1_32); let b = _mm512_set1_ps(OPRND2_32); - let src = _mm512_set_ps(1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.); + let src = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); let r = _mm512_mask_andnot_ps(src, 0b0101010101010101, a, b); - let e = _mm512_set_ps(1., ANDN_32, 3., ANDN_32, 5., ANDN_32, 7., ANDN_32, 9., ANDN_32, 11., ANDN_32, 13., ANDN_32, 15., ANDN_32); + let e = _mm512_set_ps( + 1., ANDN_32, 3., ANDN_32, 
5., ANDN_32, 7., ANDN_32, 9., ANDN_32, 11., ANDN_32, 13., + ANDN_32, 15., ANDN_32, + ); assert_eq_m512(r, e); } @@ -1025,7 +2103,10 @@ mod tests { let a = _mm512_set1_ps(OPRND1_32); let b = _mm512_set1_ps(OPRND2_32); let r = _mm512_maskz_andnot_ps(0b0101010101010101, a, b); - let e = _mm512_set_ps(0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32); + let e = _mm512_set_ps( + 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., ANDN_32, 0., + ANDN_32, 0., ANDN_32, + ); assert_eq_m512(r, e); } @@ -1146,9 +2227,14 @@ mod tests { unsafe fn test_mm512_mask_or_ps() { let a = _mm512_set1_ps(OPRND1_32); let b = _mm512_set1_ps(OPRND2_32); - let src = _mm512_set_ps(1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.); + let src = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); let r = _mm512_mask_or_ps(src, 0b0101010101010101, a, b); - let e = _mm512_set_ps(1., OR_32, 3., OR_32, 5., OR_32, 7., OR_32, 9., OR_32, 11., OR_32, 13., OR_32, 15., OR_32); + let e = _mm512_set_ps( + 1., OR_32, 3., OR_32, 5., OR_32, 7., OR_32, 9., OR_32, 11., OR_32, 13., OR_32, 15., + OR_32, + ); assert_eq_m512(r, e); } @@ -1157,7 +2243,9 @@ mod tests { let a = _mm512_set1_ps(OPRND1_32); let b = _mm512_set1_ps(OPRND2_32); let r = _mm512_maskz_or_ps(0b0101010101010101, a, b); - let e = _mm512_set_ps(0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32); + let e = _mm512_set_ps( + 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, 0., OR_32, + ); assert_eq_m512(r, e); } @@ -1278,9 +2366,14 @@ mod tests { unsafe fn test_mm512_mask_xor_ps() { let a = _mm512_set1_ps(OPRND1_32); let b = _mm512_set1_ps(OPRND2_32); - let src = _mm512_set_ps(1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.); + let src = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); let r 
= _mm512_mask_xor_ps(src, 0b0101010101010101, a, b); - let e = _mm512_set_ps(1., XOR_32, 3., XOR_32, 5., XOR_32, 7., XOR_32, 9., XOR_32, 11., XOR_32, 13., XOR_32, 15., XOR_32); + let e = _mm512_set_ps( + 1., XOR_32, 3., XOR_32, 5., XOR_32, 7., XOR_32, 9., XOR_32, 11., XOR_32, 13., XOR_32, + 15., XOR_32, + ); assert_eq_m512(r, e); } @@ -1289,9 +2382,643 @@ mod tests { let a = _mm512_set1_ps(OPRND1_32); let b = _mm512_set1_ps(OPRND2_32); let r = _mm512_maskz_xor_ps(0b0101010101010101, a, b); - let e = _mm512_set_ps(0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32); + let e = _mm512_set_ps( + 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., XOR_32, 0., + XOR_32, + ); assert_eq_m512(r, e); } + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_broadcast_f32x2() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_broadcast_f32x2(a); + let e = _mm256_set_ps(1., 2., 1., 2., 1., 2., 1., 2.); + assert_eq_m256(r, e); + } + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_mask_broadcast_f32x2() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm256_set_ps(5., 6., 7., 8., 9., 10., 11., 12.); + let r = _mm256_mask_broadcast_f32x2(b, 0b01101001, a); + let e = _mm256_set_ps(5., 2., 1., 8., 1., 10., 11., 2.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_f32x2() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm256_maskz_broadcast_f32x2(0b01101001, a); + let e = _mm256_set_ps(0., 2., 1., 0., 1., 0., 0., 2.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_broadcast_f32x2() { + let a = _mm_set_ps(1., 2., 3., 4.); + let r = _mm512_broadcast_f32x2(a); + let e = _mm512_set_ps( + 1., 2., 1., 2., 1., 2., 1., 2., 1., 2., 1., 2., 1., 2., 1., 2., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_mm512_mask_broadcast_f32x2() { + let 
a = _mm_set_ps(1., 2., 3., 4.);
        let b = _mm512_set_ps(
            5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20.,
        );
        let r = _mm512_mask_broadcast_f32x2(b, 0b0110100100111100, a);
        // `_mm_set_ps` lists elements high->low, so the LOW f32 pair of `a` — the pair
        // vbroadcastf32x2 repeats — is 4. (element 0) and 3. (element 1): even elements
        // become 4., odd elements 3.; unset mask bits keep `b`.
        let e = _mm512_set_ps(
            5., 4., 3., 8., 3., 10., 11., 4., 13., 14., 3., 4., 3., 4., 19., 20.,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_maskz_broadcast_f32x2() {
        let a = _mm_set_ps(1., 2., 3., 4.);
        let r = _mm512_maskz_broadcast_f32x2(0b0110100100111100, a);
        // Low pair of `a` is (4., 3.): even elements 4., odd elements 3.; masked-off zeroed.
        let e = _mm512_set_ps(
            0., 4., 3., 0., 3., 0., 0., 4., 0., 0., 3., 4., 3., 4., 0., 0.,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_broadcast_f32x8() {
        let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm512_broadcast_f32x8(a);
        // Whole-register broadcast: both 256-bit halves equal `a`, so the `set_ps`
        // argument pattern simply repeats.
        let e = _mm512_set_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 1., 2., 3., 4., 5., 6., 7., 8.,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_mask_broadcast_f32x8() {
        let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm512_set_ps(
            9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.,
        );
        let r = _mm512_mask_broadcast_f32x8(b, 0b0110100100111100, a);
        let e = _mm512_set_ps(
            9., 2., 3., 12., 5., 14., 15., 8., 17., 18., 3., 4., 5., 6., 23., 24.,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_maskz_broadcast_f32x8() {
        let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm512_maskz_broadcast_f32x8(0b0110100100111100, a);
        let e = _mm512_set_ps(
            0., 2., 3., 0., 5., 0., 0., 8., 0., 0., 3., 4., 5., 6., 0., 0.,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_broadcast_f64x2() {
        let a = _mm_set_pd(1., 2.);
        let r = _mm256_broadcast_f64x2(a);
        let e = _mm256_set_pd(1., 2., 1., 2.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_mask_broadcast_f64x2() {
        let a = _mm_set_pd(1., 2.);
        let b = _mm256_set_pd(3., 4., 5., 6.);
        let r = _mm256_mask_broadcast_f64x2(b, 0b0110, a);
        let e = _mm256_set_pd(3., 2., 1., 6.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_maskz_broadcast_f64x2() {
        let a = _mm_set_pd(1., 2.);
        let r = _mm256_maskz_broadcast_f64x2(0b0110, a);
        let e = _mm256_set_pd(0., 2., 1., 0.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_broadcast_f64x2() {
        let a = _mm_set_pd(1., 2.);
        let r = _mm512_broadcast_f64x2(a);
        let e = _mm512_set_pd(1., 2., 1., 2., 1., 2., 1., 2.);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_mask_broadcast_f64x2() {
        let a = _mm_set_pd(1., 2.);
        let b = _mm512_set_pd(3., 4., 5., 6., 7., 8., 9., 10.);
        let r = _mm512_mask_broadcast_f64x2(b, 0b01101001, a);
        let e = _mm512_set_pd(3., 2., 1., 6., 1., 8., 9., 2.);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_maskz_broadcast_f64x2() {
        let a = _mm_set_pd(1., 2.);
        let r = _mm512_maskz_broadcast_f64x2(0b01101001, a);
        let e = _mm512_set_pd(0., 2., 1., 0., 1., 0., 0., 2.);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm_broadcast_i32x2() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let r = _mm_broadcast_i32x2(a);
        // `_mm_set_epi32` lists elements high->low, so the LOW dword pair of `a`
        // is (4, 3); vbroadcasti32x2 repeats it: even elements 4, odd elements 3.
        let e = _mm_set_epi32(3, 4, 3, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm_mask_broadcast_i32x2() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let b = _mm_set_epi32(5, 6, 7, 8);
        let r = _mm_mask_broadcast_i32x2(b, 0b0110, a);
        // Mask 0b0110 keeps `b` in elements 0 and 3 (values 8 and 5) and takes the
        // broadcast pair (even = 4, odd = 3) in elements 1 and 2.
        let e = _mm_set_epi32(5, 4, 3, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm_maskz_broadcast_i32x2() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let r = _mm_maskz_broadcast_i32x2(0b0110, a);
        let e = _mm_set_epi32(0, 4, 3, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_broadcast_i32x2() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let r = _mm256_broadcast_i32x2(a);
        let e = _mm256_set_epi32(3, 4, 3, 4, 3, 4, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_mask_broadcast_i32x2() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let b = _mm256_set_epi32(5, 6, 7, 8, 9, 10, 11, 12);
        let r = _mm256_mask_broadcast_i32x2(b, 0b01101001, a);
        // Mask bits 0, 3, 5, 6 select the broadcast pair (even = 4, odd = 3);
        // the remaining elements keep `b`.
        let e = _mm256_set_epi32(5, 4, 3, 8, 3, 10, 11, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_maskz_broadcast_i32x2() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let r = _mm256_maskz_broadcast_i32x2(0b01101001, a);
        let e = _mm256_set_epi32(0, 4, 3, 0, 3, 0, 0, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_broadcast_i32x2() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let r = _mm512_broadcast_i32x2(a);
        let e = _mm512_set_epi32(3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_mask_broadcast_i32x2() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let b = _mm512_set_epi32(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20);
        let r = _mm512_mask_broadcast_i32x2(b, 0b0110100100111100, a);
        let e = _mm512_set_epi32(5, 4, 3, 8, 3, 10, 11, 4, 13, 14, 3, 4, 3, 4, 19, 20);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_maskz_broadcast_i32x2() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let r = _mm512_maskz_broadcast_i32x2(0b0110100100111100, a);
        let e = _mm512_set_epi32(0, 4, 3, 0, 3, 0, 0, 4, 0, 0, 3, 4, 3, 4, 0, 0);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_broadcast_i32x8() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_broadcast_i32x8(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_mask_broadcast_i32x8() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm512_set_epi32(
            9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
        );
        let r = _mm512_mask_broadcast_i32x8(b, 0b0110100100111100, a);
        let e = _mm512_set_epi32(9, 2, 3, 12, 5, 14, 15, 8, 17, 18, 3, 4, 5, 6, 23, 24);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_maskz_broadcast_i32x8() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_maskz_broadcast_i32x8(0b0110100100111100, a);
        let e = _mm512_set_epi32(0, 2, 3, 0, 5, 0, 0, 8, 0, 0, 3, 4, 5, 6, 0, 0);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_broadcast_i64x2() {
        let a = _mm_set_epi64x(1, 2);
        let r = _mm256_broadcast_i64x2(a);
        let e = _mm256_set_epi64x(1, 2, 1, 2);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_mask_broadcast_i64x2() {
        let a = _mm_set_epi64x(1, 2);
        let b = _mm256_set_epi64x(3, 4, 5, 6);
        let r = _mm256_mask_broadcast_i64x2(b, 0b0110, a);
        let e = _mm256_set_epi64x(3, 2, 1, 6);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_maskz_broadcast_i64x2() {
        let a = _mm_set_epi64x(1, 2);
        let r = _mm256_maskz_broadcast_i64x2(0b0110, a);
        let e = _mm256_set_epi64x(0, 2, 1, 0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_broadcast_i64x2() {
        let a = _mm_set_epi64x(1, 2);
        let r = _mm512_broadcast_i64x2(a);
        let e = _mm512_set_epi64(1, 2, 1, 2, 1, 2, 1, 2);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_mask_broadcast_i64x2() {
        let a = _mm_set_epi64x(1, 2);
        let b = _mm512_set_epi64(3, 4, 5, 6, 7, 8, 9, 10);
        let r = _mm512_mask_broadcast_i64x2(b, 0b01101001, a);
        let e = _mm512_set_epi64(3, 2, 1, 6, 1, 8, 9, 2);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_maskz_broadcast_i64x2() {
        let a = _mm_set_epi64x(1, 2);
        let r = _mm512_maskz_broadcast_i64x2(0b01101001, a);
        let e = _mm512_set_epi64(0, 2, 1, 0, 1, 0, 0, 2);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_extractf32x8_ps() {
        let a = _mm512_set_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let r = _mm512_extractf32x8_ps::<1>(a);
        // IMM 1 selects bits 511:256 (elements 8..15) — the FIRST eight `set_ps`
        // arguments, since `set_ps` lists elements high->low.
        let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_mask_extractf32x8_ps() {
        let a = _mm512_set_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.);
        let r = _mm512_mask_extractf32x8_ps::<1>(b, 0b01101001, a);
        // Extracted half is set_ps(1..8); mask bits 0, 3, 5, 6 take it, the rest keep `b`.
        let e = _mm256_set_ps(17., 2., 3., 20., 5., 22., 23., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_maskz_extractf32x8_ps() {
        let a = _mm512_set_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let r = _mm512_maskz_extractf32x8_ps::<1>(0b01101001, a);
        let e = _mm256_set_ps(0., 2., 3., 0., 5., 0., 0., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_extractf64x2_pd() {
        let a = _mm256_set_pd(1., 2., 3., 4.);
        let r = _mm256_extractf64x2_pd::<1>(a);
        // IMM 1 selects bits 255:128 (elements 2..3) — the first two `set_pd` arguments.
        let e = _mm_set_pd(1., 2.);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_mask_extractf64x2_pd() {
        let a = _mm256_set_pd(1., 2., 3., 4.);
        let b = _mm_set_pd(5., 6.);
        let r = _mm256_mask_extractf64x2_pd::<1>(b, 0b01, a);
        // Extracted lane is set_pd(1., 2.); bit 0 takes its element 0 (2.), bit 1 keeps b (5.).
        let e = _mm_set_pd(5., 2.);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_maskz_extractf64x2_pd() {
        let a = _mm256_set_pd(1., 2., 3., 4.);
        let r = _mm256_maskz_extractf64x2_pd::<1>(0b01, a);
        let e = _mm_set_pd(0., 2.);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_extractf64x2_pd() {
        let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm512_extractf64x2_pd::<2>(a);
        // IMM 2 selects bits 383:256 (elements 4..5), which hold 4. and 3.
        let e = _mm_set_pd(3., 4.);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_mask_extractf64x2_pd() {
        let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm_set_pd(9., 10.);
        let r = _mm512_mask_extractf64x2_pd::<2>(b, 0b01, a);
        let e = _mm_set_pd(9., 4.);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_maskz_extractf64x2_pd() {
        let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm512_maskz_extractf64x2_pd::<2>(0b01, a);
        let e = _mm_set_pd(0., 4.);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_extracti32x8_epi32() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_extracti32x8_epi32::<1>(a);
        // IMM 1 selects the upper half (elements 8..15) — the first eight `set` args.
        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_mask_extracti32x8_epi32() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24);
        let r = _mm512_mask_extracti32x8_epi32::<1>(b, 0b01101001, a);
        let e = _mm256_set_epi32(17, 2, 3, 20, 5, 22, 23, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_maskz_extracti32x8_epi32() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_maskz_extracti32x8_epi32::<1>(0b01101001, a);
        let e = _mm256_set_epi32(0, 2, 3, 0, 5, 0, 0, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_extracti64x2_epi64() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let r = _mm256_extracti64x2_epi64::<1>(a);
        // IMM 1 selects the upper 128 bits (elements 2..3) — the first two `set` args.
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_mask_extracti64x2_epi64() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let b = _mm_set_epi64x(5, 6);
        let r = _mm256_mask_extracti64x2_epi64::<1>(b, 0b01, a);
        let e = _mm_set_epi64x(5, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_maskz_extracti64x2_epi64() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let r = _mm256_maskz_extracti64x2_epi64::<1>(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_extracti64x2_epi64() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_extracti64x2_epi64::<2>(a);
        // IMM 2 selects bits 383:256 (elements 4..5), which hold 4 and 3.
        let e = _mm_set_epi64x(3, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_mask_extracti64x2_epi64() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_set_epi64x(9, 10);
        let r = _mm512_mask_extracti64x2_epi64::<2>(b, 0b01, a);
        let e = _mm_set_epi64x(9, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_maskz_extracti64x2_epi64() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_maskz_extracti64x2_epi64::<2>(0b01, a);
        let e = _mm_set_epi64x(0, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_insertf32x8() {
        let a = _mm512_set_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.);
        let r = _mm512_insertf32x8::<1>(a, b);
        // IMM 1 replaces bits 511:256 (elements 8..15) with `b`; the LOW half of `a`
        // (the last eight `set_ps` arguments) is kept.
        let e = _mm512_set_ps(
            17., 18., 19., 20., 21., 22., 23., 24., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_mask_insertf32x8() {
        let a = _mm512_set_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.);
        let src = _mm512_set_ps(
            25., 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40.,
        );
        let r = _mm512_mask_insertf32x8::<1>(src, 0b0110100100111100, a, b);
        // Insertion result is set_ps(17..24, 9..16); mask bits pick it per element,
        // remaining elements keep `src`.
        let e = _mm512_set_ps(
            25., 18., 19., 28., 21., 30., 31., 24., 33., 34., 11., 12., 13., 14., 39., 40.,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_maskz_insertf32x8() {
        let a = _mm512_set_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.);
        let r = _mm512_maskz_insertf32x8::<1>(0b0110100100111100, a, b);
        let e = _mm512_set_ps(
            0., 18., 19., 0., 21., 0., 0., 24., 0., 0., 11., 12., 13., 14., 0., 0.,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_insertf64x2() {
        let a = _mm256_set_pd(1., 2., 3., 4.);
        let b = _mm_set_pd(5., 6.);
        let r = _mm256_insertf64x2::<1>(a, b);
        // IMM 1 replaces the upper 128 bits (elements 2..3) with `b`; the low lane
        // of `a` (the last two `set_pd` arguments, 3. and 4.) is kept.
        let e = _mm256_set_pd(5., 6., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_mask_insertf64x2() {
        let a = _mm256_set_pd(1., 2., 3., 4.);
        let b = _mm_set_pd(5., 6.);
        let src = _mm256_set_pd(7., 8., 9., 10.);
        let r = _mm256_mask_insertf64x2::<1>(src, 0b0110, a, b);
        // Insertion result is set_pd(5., 6., 3., 4.); mask 0b0110 keeps `src`
        // in elements 0 and 3.
        let e = _mm256_set_pd(7., 6., 3., 10.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_maskz_insertf64x2() {
        let a = _mm256_set_pd(1., 2., 3., 4.);
        let b = _mm_set_pd(5., 6.);
        let r = _mm256_maskz_insertf64x2::<1>(0b0110, a, b);
        let e = _mm256_set_pd(0., 6., 3., 0.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_insertf64x2() {
        let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm_set_pd(9., 10.);
        let r = _mm512_insertf64x2::<2>(a, b);
        // IMM 2 replaces bits 383:256 (elements 4..5) with `b`.
        let e = _mm512_set_pd(1., 2., 9., 10., 5., 6., 7., 8.);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_mask_insertf64x2() {
        let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm_set_pd(9., 10.);
        let src = _mm512_set_pd(11., 12., 13., 14., 15., 16., 17., 18.);
        let r = _mm512_mask_insertf64x2::<2>(src, 0b01101001, a, b);
        // Insertion result is set_pd(1., 2., 9., 10., 5., 6., 7., 8.); mask bits
        // 0, 3, 5, 6 take it, the rest keep `src`.
        let e = _mm512_set_pd(11., 2., 9., 14., 5., 16., 17., 8.);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_maskz_insertf64x2() {
        let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm_set_pd(9., 10.);
        let r = _mm512_maskz_insertf64x2::<2>(0b01101001, a, b);
        let e = _mm512_set_pd(0., 2., 9., 0., 5., 0., 0., 8.);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_inserti32x8() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24);
        let r = _mm512_inserti32x8::<1>(a, b);
        // IMM 1 replaces the upper half (elements 8..15) with `b`; the low half of `a`
        // (the last eight `set` arguments) is kept.
        let e = _mm512_set_epi32(17, 18, 19, 20, 21, 22, 23, 24, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_mask_inserti32x8() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24);
        let src = _mm512_set_epi32(
            25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
        );
        let r = _mm512_mask_inserti32x8::<1>(src, 0b0110100100111100, a, b);
        let e = _mm512_set_epi32(25, 18, 19, 28, 21, 30, 31, 24, 33, 34, 11, 12, 13, 14, 39, 40);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_maskz_inserti32x8() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24);
        let r = _mm512_maskz_inserti32x8::<1>(0b0110100100111100, a, b);
        let e = _mm512_set_epi32(0, 18, 19, 0, 21, 0, 0, 24, 0, 0, 11, 12, 13, 14, 0, 0);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_inserti64x2() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let b = _mm_set_epi64x(5, 6);
        let r = _mm256_inserti64x2::<1>(a, b);
        // IMM 1 replaces the upper 128 bits (elements 2..3) with `b`.
        let e = _mm256_set_epi64x(5, 6, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_mask_inserti64x2() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let b = _mm_set_epi64x(5, 6);
        let src = _mm256_set_epi64x(7, 8, 9, 10);
        let r = _mm256_mask_inserti64x2::<1>(src, 0b0110, a, b);
        let e = _mm256_set_epi64x(7, 6, 3, 10);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512dq,avx512vl")]
    unsafe fn test_mm256_maskz_inserti64x2() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let b = _mm_set_epi64x(5, 6);
        let r = _mm256_maskz_inserti64x2::<1>(0b0110, a, b);
        let e = _mm256_set_epi64x(0, 6, 3, 0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_inserti64x2() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_set_epi64x(9, 10);
        let r = _mm512_inserti64x2::<2>(a, b);
        // IMM 2 replaces bits 383:256 (elements 4..5) with `b`.
        let e = _mm512_set_epi64(1, 2, 9, 10, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_mask_inserti64x2() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_set_epi64x(9, 10);
        let src = _mm512_set_epi64(11, 12, 13, 14, 15, 16, 17, 18);
        let r = _mm512_mask_inserti64x2::<2>(src, 0b01101001, a, b);
        let e = _mm512_set_epi64(11, 2, 9, 14, 5, 16, 17, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512dq")]
    unsafe fn test_mm512_maskz_inserti64x2() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_set_epi64x(9, 10);
        let r = _mm512_maskz_inserti64x2::<2>(0b01101001, a, b);
        let e = _mm512_set_epi64(0, 2, 9, 0, 5, 0, 0, 8);
        assert_eq_m512i(r, e);
    }
}