diff --git a/library/stdarch/crates/core_arch/avx512dq.md b/library/stdarch/crates/core_arch/avx512dq.md
index 9dcf8e21ae3f..807515f57e46 100644
--- a/library/stdarch/crates/core_arch/avx512dq.md
+++ b/library/stdarch/crates/core_arch/avx512dq.md
@@ -405,41 +405,41 @@
- Reduce:
- * [ ] _mm512_reduce_round_pd
- * [ ] _mm512_mask_reduce_round_pd
- * [ ] _mm512_maskz_reduce_round_pd
- * [ ] _mm_reduce_pd
- * [ ] _mm_mask_reduce_pd
- * [ ] _mm_maskz_reduce_pd
- * [ ] _mm256_reduce_pd
- * [ ] _mm256_mask_reduce_pd
- * [ ] _mm256_maskz_reduce_pd
- * [ ] _mm512_reduce_pd
- * [ ] _mm512_mask_reduce_pd
- * [ ] _mm512_maskz_reduce_pd
- * [ ] _mm512_reduce_round_ps
- * [ ] _mm512_mask_reduce_round_ps
- * [ ] _mm512_maskz_reduce_round_ps
- * [ ] _mm_reduce_ps
- * [ ] _mm_mask_reduce_ps
- * [ ] _mm_maskz_reduce_ps
- * [ ] _mm256_reduce_ps
- * [ ] _mm256_mask_reduce_ps
- * [ ] _mm256_maskz_reduce_ps
- * [ ] _mm512_reduce_ps
- * [ ] _mm512_mask_reduce_ps
- * [ ] _mm512_maskz_reduce_ps
- * [ ] _mm_reduce_round_sd
- * [ ] _mm_mask_reduce_round_sd
- * [ ] _mm_maskz_reduce_round_sd
- * [ ] _mm_reduce_sd
- * [ ] _mm_mask_reduce_sd
- * [ ] _mm_maskz_reduce_sd
- * [ ] _mm_reduce_round_ss
- * [ ] _mm_mask_reduce_round_ss
- * [ ] _mm_maskz_reduce_round_ss
- * [ ] _mm_reduce_ss
- * [ ] _mm_mask_reduce_ss
- * [ ] _mm_maskz_reduce_ss
+ * [x] _mm512_reduce_round_pd
+ * [x] _mm512_mask_reduce_round_pd
+ * [x] _mm512_maskz_reduce_round_pd
+ * [x] _mm_reduce_pd
+ * [x] _mm_mask_reduce_pd
+ * [x] _mm_maskz_reduce_pd
+ * [x] _mm256_reduce_pd
+ * [x] _mm256_mask_reduce_pd
+ * [x] _mm256_maskz_reduce_pd
+ * [x] _mm512_reduce_pd
+ * [x] _mm512_mask_reduce_pd
+ * [x] _mm512_maskz_reduce_pd
+ * [x] _mm512_reduce_round_ps
+ * [x] _mm512_mask_reduce_round_ps
+ * [x] _mm512_maskz_reduce_round_ps
+ * [x] _mm_reduce_ps
+ * [x] _mm_mask_reduce_ps
+ * [x] _mm_maskz_reduce_ps
+ * [x] _mm256_reduce_ps
+ * [x] _mm256_mask_reduce_ps
+ * [x] _mm256_maskz_reduce_ps
+ * [x] _mm512_reduce_ps
+ * [x] _mm512_mask_reduce_ps
+ * [x] _mm512_maskz_reduce_ps
+ * [x] _mm_reduce_round_sd
+ * [x] _mm_mask_reduce_round_sd
+ * [x] _mm_maskz_reduce_round_sd
+ * [x] _mm_reduce_sd
+ * [x] _mm_mask_reduce_sd
+ * [x] _mm_maskz_reduce_sd
+ * [x] _mm_reduce_round_ss
+ * [x] _mm_mask_reduce_round_ss
+ * [x] _mm_maskz_reduce_round_ss
+ * [x] _mm_reduce_ss
+ * [x] _mm_mask_reduce_ss
+ * [x] _mm_maskz_reduce_ss
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512dq.rs b/library/stdarch/crates/core_arch/src/x86/avx512dq.rs
index 7e7be9cf10af..f1c275fd61a5 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512dq.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512dq.rs
@@ -1,4 +1,5 @@
use crate::{
+ arch::asm,
core_arch::{simd::*, x86::*},
intrinsics::simd::*,
mem::transmute,
@@ -1921,11 +1922,11 @@ pub unsafe fn _mm512_maskz_inserti64x2<const IMM8: i32>(
/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepi64_pd&ig_expand=1437)
#[inline]
@@ -1942,11 +1943,11 @@ pub unsafe fn _mm512_cvt_roundepi64_pd<const ROUNDING: i32>(a: __m512i) -> __m51
/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
/// not set). Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepi64_pd&ig_expand=1438)
#[inline]
@@ -1968,11 +1969,11 @@ pub unsafe fn _mm512_mask_cvt_roundepi64_pd<const ROUNDING: i32>(
/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
/// Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepi64_pd&ig_expand=1439)
#[inline]
@@ -2113,11 +2114,11 @@ pub unsafe fn _mm512_maskz_cvtepi64_pd(k: __mmask8, a: __m512i) -> __m512d {
/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepi64_ps&ig_expand=1443)
#[inline]
@@ -2134,11 +2135,11 @@ pub unsafe fn _mm512_cvt_roundepi64_ps<const ROUNDING: i32>(a: __m512i) -> __m25
/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
/// not set). Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepi64_ps&ig_expand=1444)
#[inline]
@@ -2160,11 +2161,11 @@ pub unsafe fn _mm512_mask_cvt_roundepi64_ps<const ROUNDING: i32>(
/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
/// Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepi64_ps&ig_expand=1445)
#[inline]
@@ -2302,11 +2303,11 @@ pub unsafe fn _mm512_maskz_cvtepi64_ps(k: __mmask8, a: __m512i) -> __m256 {
/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepu64_pd&ig_expand=1455)
#[inline]
@@ -2323,11 +2324,11 @@ pub unsafe fn _mm512_cvt_roundepu64_pd<const ROUNDING: i32>(a: __m512i) -> __m51
/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
/// not set). Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepu64_pd&ig_expand=1456)
#[inline]
@@ -2349,11 +2350,11 @@ pub unsafe fn _mm512_mask_cvt_roundepu64_pd<const ROUNDING: i32>(
/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
/// Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepu64_pd&ig_expand=1457)
#[inline]
@@ -2494,11 +2495,11 @@ pub unsafe fn _mm512_maskz_cvtepu64_pd(k: __mmask8, a: __m512i) -> __m512d {
/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundepu64_ps&ig_expand=1461)
#[inline]
@@ -2515,11 +2516,11 @@ pub unsafe fn _mm512_cvt_roundepu64_ps<const ROUNDING: i32>(a: __m512i) -> __m25
/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
/// not set). Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundepu64_ps&ig_expand=1462)
#[inline]
@@ -2541,11 +2542,11 @@ pub unsafe fn _mm512_mask_cvt_roundepu64_ps<const ROUNDING: i32>(
/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
/// Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundepu64_ps&ig_expand=1463)
#[inline]
@@ -2683,11 +2684,11 @@ pub unsafe fn _mm512_maskz_cvtepu64_ps(k: __mmask8, a: __m512i) -> __m256 {
/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers,
/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_epi64&ig_expand=1472)
#[inline]
@@ -2704,11 +2705,11 @@ pub unsafe fn _mm512_cvt_roundpd_epi64<const ROUNDING: i32>(a: __m512d) -> __m51
/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
/// not set). Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_epi64&ig_expand=1473)
#[inline]
@@ -2729,11 +2730,11 @@ pub unsafe fn _mm512_mask_cvt_roundpd_epi64<const ROUNDING: i32>(
/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
/// Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundpd_epi64&ig_expand=1474)
#[inline]
@@ -2868,11 +2869,11 @@ pub unsafe fn _mm512_maskz_cvtpd_epi64(k: __mmask8, a: __m512d) -> __m512i {
/// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers,
/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epi64&ig_expand=1514)
#[inline]
@@ -2889,11 +2890,11 @@ pub unsafe fn _mm512_cvt_roundps_epi64<const ROUNDING: i32>(a: __m256) -> __m512
/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
/// not set). Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epi64&ig_expand=1515)
#[inline]
@@ -2914,11 +2915,11 @@ pub unsafe fn _mm512_mask_cvt_roundps_epi64<const ROUNDING: i32>(
/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
/// Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epi64&ig_expand=1516)
#[inline]
@@ -3053,11 +3054,11 @@ pub unsafe fn _mm512_maskz_cvtps_epi64(k: __mmask8, a: __m256) -> __m512i {
/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers,
/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundpd_epu64&ig_expand=1478)
#[inline]
@@ -3074,11 +3075,11 @@ pub unsafe fn _mm512_cvt_roundpd_epu64<const ROUNDING: i32>(a: __m512d) -> __m51
/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
/// not set). Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundpd_epu64&ig_expand=1479)
#[inline]
@@ -3099,11 +3100,11 @@ pub unsafe fn _mm512_mask_cvt_roundpd_epu64<const ROUNDING: i32>(
/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
/// Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundpd_epu64&ig_expand=1480)
#[inline]
@@ -3238,11 +3239,11 @@ pub unsafe fn _mm512_maskz_cvtpd_epu64(k: __mmask8, a: __m512d) -> __m512i {
/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers,
/// and store the results in dst. Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epu64&ig_expand=1520)
#[inline]
@@ -3259,11 +3260,11 @@ pub unsafe fn _mm512_cvt_roundps_epu64<const ROUNDING: i32>(a: __m256) -> __m512
/// and store the results in dst using writemask k (elements are copied from src if the corresponding bit is
/// not set). Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epu64&ig_expand=1521)
#[inline]
@@ -3284,11 +3285,11 @@ pub unsafe fn _mm512_mask_cvt_roundps_epu64<const ROUNDING: i32>(
/// and store the results in dst using zeromask k (elements are zeroed out if the corresponding bit is not set).
/// Rounding is done according to the ROUNDING parameter, which can be one of:
///
-/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
+/// - (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// - (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// - (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// - (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC
///
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epu64&ig_expand=1522)
#[inline]
@@ -4626,6 +4627,8 @@ pub unsafe fn _mm512_movm_epi64(k: __mmask8) -> __m512i {
_mm512_maskz_mov_epi64(k, ones)
}
+// Range
+
/// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed
/// double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
/// Lower 2 bits of IMM8 specifies the operation control:
@@ -4750,7 +4753,7 @@ pub unsafe fn _mm_mask_range_pd(
a.as_f64x2(),
b.as_f64x2(),
IMM8,
- src.as_f64x4(),
+ src.as_f64x2(),
k,
))
}
@@ -5467,6 +5470,944 @@ pub unsafe fn _mm_maskz_range_ss(k: __mmask8, a: __m128, b: __m
+ _mm_mask_range_ss::<IMM8>(_mm_setzero_ps(), k, a, b)
}
+// Reduce
+
+/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_round_pd&ig_expand=5438)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(1, 2)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_reduce_round_pd<const IMM8: i32, const SAE: i32>(a: __m512d) -> __m512d {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ _mm512_mask_reduce_round_pd::<IMM8, SAE>(_mm512_undefined_pd(), 0xff, a)
+}
+
+/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are
+/// copied from src to dst if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_round_pd&ig_expand=5436)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_mask_reduce_round_pd<const IMM8: i32, const SAE: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+) -> __m512d {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ transmute(vreducepd_512(a.as_f64x8(), IMM8, src.as_f64x8(), k, SAE))
+}
+
+/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are
+/// zeroed out if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_round_pd&ig_expand=5437)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_maskz_reduce_round_pd<const IMM8: i32, const SAE: i32>(
+ k: __mmask8,
+ a: __m512d,
+) -> __m512d {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ _mm512_mask_reduce_round_pd::<IMM8, SAE>(_mm512_setzero_pd(), k, a)
+}
+
+/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_pd&ig_expand=5411)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_pd<const IMM8: i32>(a: __m128d) -> __m128d {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_reduce_pd::<IMM8>(_mm_undefined_pd(), 0xff, a)
+}
+
+/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are
+/// copied from src to dst if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_pd&ig_expand=5409)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_pd<const IMM8: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+) -> __m128d {
+ static_assert_uimm_bits!(IMM8, 8);
+ transmute(vreducepd_128(a.as_f64x2(), IMM8, src.as_f64x2(), k))
+}
+
+/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are
+/// zeroed out if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_pd&ig_expand=5410)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_maskz_reduce_pd<const IMM8: i32>(k: __mmask8, a: __m128d) -> __m128d {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_reduce_pd::<IMM8>(_mm_setzero_pd(), k, a)
+}
+
+/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_pd&ig_expand=5414)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_pd<const IMM8: i32>(a: __m256d) -> __m256d {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_mask_reduce_pd::<IMM8>(_mm256_undefined_pd(), 0xff, a)
+}
+
+/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are
+/// copied from src to dst if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_pd&ig_expand=5412)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_pd<const IMM8: i32>(
+ src: __m256d,
+ k: __mmask8,
+ a: __m256d,
+) -> __m256d {
+ static_assert_uimm_bits!(IMM8, 8);
+ transmute(vreducepd_256(a.as_f64x4(), IMM8, src.as_f64x4(), k))
+}
+
+/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are
+/// zeroed out if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_reduce_pd&ig_expand=5413)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_maskz_reduce_pd<const IMM8: i32>(k: __mmask8, a: __m256d) -> __m256d {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_mask_reduce_pd::<IMM8>(_mm256_setzero_pd(), k, a)
+}
+
+/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_pd&ig_expand=5417)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_reduce_pd<const IMM8: i32>(a: __m512d) -> __m512d {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_mask_reduce_pd::<IMM8>(_mm512_undefined_pd(), 0xff, a)
+}
+
+/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are
+/// copied from src to dst if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_pd&ig_expand=5415)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_mask_reduce_pd<const IMM8: i32>(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+) -> __m512d {
+ static_assert_uimm_bits!(IMM8, 8);
+ transmute(vreducepd_512(
+ a.as_f64x8(),
+ IMM8,
+ src.as_f64x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are
+/// zeroed out if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_pd&ig_expand=5416)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_maskz_reduce_pd<const IMM8: i32>(k: __mmask8, a: __m512d) -> __m512d {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_mask_reduce_pd::<IMM8>(_mm512_setzero_pd(), k, a)
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_round_ps&ig_expand=5444)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(1, 2)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_reduce_round_ps<const IMM8: i32, const SAE: i32>(a: __m512) -> __m512 {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ _mm512_mask_reduce_round_ps::<IMM8, SAE>(_mm512_undefined_ps(), 0xffff, a)
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are
+/// copied from src to dst if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_round_ps&ig_expand=5442)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_mask_reduce_round_ps<const IMM8: i32, const SAE: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+) -> __m512 {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ transmute(vreduceps_512(a.as_f32x16(), IMM8, src.as_f32x16(), k, SAE))
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are
+/// zeroed out if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_round_ps&ig_expand=5443)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_maskz_reduce_round_ps<const IMM8: i32, const SAE: i32>(
+ k: __mmask16,
+ a: __m512,
+) -> __m512 {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ _mm512_mask_reduce_round_ps::<IMM8, SAE>(_mm512_setzero_ps(), k, a)
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_ps&ig_expand=5429)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_ps<const IMM8: i32>(a: __m128) -> __m128 {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_reduce_ps::<IMM8>(_mm_undefined_ps(), 0xff, a)
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are
+/// copied from src to dst if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_ps&ig_expand=5427)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_ps<const IMM8: i32>(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+ static_assert_uimm_bits!(IMM8, 8);
+ transmute(vreduceps_128(a.as_f32x4(), IMM8, src.as_f32x4(), k))
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are
+/// zeroed out if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_ps&ig_expand=5428)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_maskz_reduce_ps<const IMM8: i32>(k: __mmask8, a: __m128) -> __m128 {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_reduce_ps::<IMM8>(_mm_setzero_ps(), k, a)
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_ps&ig_expand=5432)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_ps<const IMM8: i32>(a: __m256) -> __m256 {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_mask_reduce_ps::<IMM8>(_mm256_undefined_ps(), 0xff, a)
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are
+/// copied from src to dst if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_ps&ig_expand=5430)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_ps<const IMM8: i32>(
+ src: __m256,
+ k: __mmask8,
+ a: __m256,
+) -> __m256 {
+ static_assert_uimm_bits!(IMM8, 8);
+ transmute(vreduceps_256(a.as_f32x8(), IMM8, src.as_f32x8(), k))
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are
+/// zeroed out if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_reduce_ps&ig_expand=5431)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_maskz_reduce_ps<const IMM8: i32>(k: __mmask8, a: __m256) -> __m256 {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_mask_reduce_ps::<IMM8>(_mm256_setzero_ps(), k, a)
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_reduce_ps&ig_expand=5435)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_reduce_ps<const IMM8: i32>(a: __m512) -> __m512 {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_mask_reduce_ps::<IMM8>(_mm512_undefined_ps(), 0xffff, a)
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using writemask k (elements are
+/// copied from src to dst if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_reduce_ps&ig_expand=5433)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_mask_reduce_ps<const IMM8: i32>(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+) -> __m512 {
+ static_assert_uimm_bits!(IMM8, 8);
+ transmute(vreduceps_512(
+ a.as_f32x16(),
+ IMM8,
+ src.as_f32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by
+/// the number of bits specified by imm8, and store the results in dst using zeromask k (elements are
+/// zeroed out if the corresponding mask bit is not set).
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_reduce_ps&ig_expand=5434)
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_maskz_reduce_ps<const IMM8: i32>(k: __mmask16, a: __m512) -> __m512 {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_mask_reduce_ps::<IMM8>(_mm512_setzero_ps(), k, a)
+}
+
+/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst, and copy
+/// the upper element from a to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_round_sd&ig_expand=5447)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_round_sd<const IMM8: i32, const SAE: i32>(
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ _mm_mask_reduce_round_sd::<IMM8, SAE>(_mm_undefined_pd(), 0xff, a, b)
+}
+
+/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask
+/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_round_sd&ig_expand=5445)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_round_sd<const IMM8: i32, const SAE: i32>(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+) -> __m128d {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ transmute(vreducesd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ src.as_f64x2(),
+ k,
+ IMM8,
+ SAE,
+ ))
+}
+
+/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask
+/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_round_sd&ig_expand=5446)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_maskz_reduce_round_sd<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    _mm_mask_reduce_round_sd::<IMM8, SAE>(_mm_setzero_pd(), k, a, b)
+}
+
+/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst, and
+/// copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_sd&ig_expand=5456)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_sd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_reduce_sd::<IMM8>(_mm_undefined_pd(), 0xff, a, b)
+}
+
+/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask
+/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_sd&ig_expand=5454)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_sd<const IMM8: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 8);
+    transmute(vreducesd(
+        a.as_f64x2(),
+        b.as_f64x2(),
+        src.as_f64x2(),
+        k,
+        IMM8,
+        _MM_FROUND_CUR_DIRECTION,
+    ))
+}
+
+/// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask
+/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_sd&ig_expand=5455)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_maskz_reduce_sd<const IMM8: i32>(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_reduce_sd::<IMM8>(_mm_setzero_pd(), k, a, b)
+}
+
+/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst, and copy
+/// the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_round_ss&ig_expand=5453)
+#[inline]
+#[target_feature(enable = "avx512dq")] // NOTE(review): sibling ss/sd intrinsics and the unit test enable "avx512dq,avx512vl" — confirm which is intended
+#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_round_ss<const IMM8: i32, const SAE: i32>(a: __m128, b: __m128) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    _mm_mask_reduce_round_ss::<IMM8, SAE>(_mm_undefined_ps(), 0xff, a, b)
+}
+
+/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask
+/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_round_ss&ig_expand=5451)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_round_ss<const IMM8: i32, const SAE: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    transmute(vreducess(
+        a.as_f32x4(),
+        b.as_f32x4(),
+        src.as_f32x4(),
+        k,
+        IMM8,
+        SAE,
+    ))
+}
+
+/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask
+/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_round_ss&ig_expand=5452)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_maskz_reduce_round_ss<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    _mm_mask_reduce_round_ss::<IMM8, SAE>(_mm_setzero_ps(), k, a, b)
+}
+
+/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst, and copy
+/// the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_ss&ig_expand=5462)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_ss<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_reduce_ss::<IMM8>(_mm_undefined_ps(), 0xff, a, b)
+}
+
+/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using writemask
+/// k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_ss&ig_expand=5460)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_ss<const IMM8: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 8);
+    transmute(vreducess(
+        a.as_f32x4(),
+        b.as_f32x4(),
+        src.as_f32x4(),
+        k,
+        IMM8,
+        _MM_FROUND_CUR_DIRECTION,
+    ))
+}
+
+/// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b
+/// by the number of bits specified by imm8, store the result in the lower element of dst using zeromask
+/// k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a
+/// to the upper element of dst.
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// - _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// - _MM_FROUND_TO_NEG_INF // round down
+/// - _MM_FROUND_TO_POS_INF // round up
+/// - _MM_FROUND_TO_ZERO // truncate
+/// - _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_reduce_ss&ig_expand=5461)
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_maskz_reduce_ss<const IMM8: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_reduce_ss::<IMM8>(_mm_setzero_ps(), k, a, b)
+}
+
#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.avx512.sitofp.round.v2f64.v2i64"]
@@ -8540,29 +9481,50 @@ mod tests {
#[simd_test(enable = "avx512dq")]
unsafe fn test_mm512_range_round_ps() {
- let a = _mm512_set_ps(1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.);
- let b = _mm512_set_ps(2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15.);
+ let a = _mm512_set_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm512_set_ps(
+ 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15.,
+ );
let r = _mm512_range_round_ps::<0b0101, _MM_FROUND_NO_EXC>(a, b);
- let e = _mm512_set_ps(2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16.);
+ let e = _mm512_set_ps(
+ 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16.,
+ );
assert_eq_m512(r, e);
}
#[simd_test(enable = "avx512dq")]
unsafe fn test_mm512_mask_range_round_ps() {
- let a = _mm512_set_ps(1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.);
- let b = _mm512_set_ps(2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15.);
- let c = _mm512_set_ps(17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.);
- let r = _mm512_mask_range_round_ps::<0b0101, _MM_FROUND_NO_EXC>(c, 0b0110100100111100, a, b);
- let e = _mm512_set_ps(17., 2., 4., 20., 6., 22., 23., 8., 25., 26., 12., 12., 14., 14., 31., 32.);
+ let a = _mm512_set_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm512_set_ps(
+ 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15.,
+ );
+ let c = _mm512_set_ps(
+ 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
+ );
+ let r =
+ _mm512_mask_range_round_ps::<0b0101, _MM_FROUND_NO_EXC>(c, 0b0110100100111100, a, b);
+ let e = _mm512_set_ps(
+ 17., 2., 4., 20., 6., 22., 23., 8., 25., 26., 12., 12., 14., 14., 31., 32.,
+ );
assert_eq_m512(r, e);
}
#[simd_test(enable = "avx512dq")]
unsafe fn test_mm512_maskz_range_round_ps() {
- let a = _mm512_set_ps(1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.);
- let b = _mm512_set_ps(2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15.);
+ let a = _mm512_set_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm512_set_ps(
+ 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15.,
+ );
let r = _mm512_maskz_range_round_ps::<0b0101, _MM_FROUND_NO_EXC>(0b0110100100111100, a, b);
- let e = _mm512_set_ps(0., 2., 4., 0., 6., 0., 0., 8., 0., 0., 12., 12., 14., 14., 0., 0.);
+ let e = _mm512_set_ps(
+ 0., 2., 4., 0., 6., 0., 0., 8., 0., 0., 12., 12., 14., 14., 0., 0.,
+ );
assert_eq_m512(r, e);
}
@@ -8624,29 +9586,49 @@ mod tests {
#[simd_test(enable = "avx512dq")]
unsafe fn test_mm512_range_ps() {
- let a = _mm512_set_ps(1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.);
- let b = _mm512_set_ps(2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15.);
+ let a = _mm512_set_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm512_set_ps(
+ 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15.,
+ );
let r = _mm512_range_ps::<0b0101>(a, b);
- let e = _mm512_set_ps(2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16.);
+ let e = _mm512_set_ps(
+ 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16.,
+ );
assert_eq_m512(r, e);
}
#[simd_test(enable = "avx512dq")]
unsafe fn test_mm512_mask_range_ps() {
- let a = _mm512_set_ps(1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.);
- let b = _mm512_set_ps(2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15.);
- let c = _mm512_set_ps(17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.);
+ let a = _mm512_set_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm512_set_ps(
+ 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15.,
+ );
+ let c = _mm512_set_ps(
+ 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
+ );
let r = _mm512_mask_range_ps::<0b0101>(c, 0b0110100100111100, a, b);
- let e = _mm512_set_ps(17., 2., 4., 20., 6., 22., 23., 8., 25., 26., 12., 12., 14., 14., 31., 32.);
+ let e = _mm512_set_ps(
+ 17., 2., 4., 20., 6., 22., 23., 8., 25., 26., 12., 12., 14., 14., 31., 32.,
+ );
assert_eq_m512(r, e);
}
#[simd_test(enable = "avx512dq")]
unsafe fn test_mm512_maskz_range_ps() {
- let a = _mm512_set_ps(1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.);
- let b = _mm512_set_ps(2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15.);
+ let a = _mm512_set_ps(
+ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let b = _mm512_set_ps(
+ 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16., 15.,
+ );
let r = _mm512_maskz_range_ps::<0b0101>(0b0110100100111100, a, b);
- let e = _mm512_set_ps(0., 2., 4., 0., 6., 0., 0., 8., 0., 0., 12., 12., 14., 14., 0., 0.);
+ let e = _mm512_set_ps(
+ 0., 2., 4., 0., 6., 0., 0., 8., 0., 0., 12., 12., 14., 14., 0., 0.,
+ );
assert_eq_m512(r, e);
}
@@ -8744,4 +9726,366 @@ mod tests {
assert_eq_m128(r, e);
}
+ #[simd_test(enable = "avx512dq")]
+ unsafe fn test_mm512_reduce_round_pd() {
+ let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0);
+ let r = _mm512_reduce_round_pd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a);
+ let e = _mm512_set_pd(0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq")]
+ unsafe fn test_mm512_mask_reduce_round_pd() {
+ let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0);
+ let src = _mm512_set_pd(3., 4., 5., 6., 7., 8., 9., 10.);
+ let r = _mm512_mask_reduce_round_pd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
+ src, 0b01101001, a,
+ );
+ let e = _mm512_set_pd(3., 0., 0.25, 4., 0.25, 6., 7., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq")]
+ unsafe fn test_mm512_maskz_reduce_round_pd() {
+ let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0);
+ let r = _mm512_maskz_reduce_round_pd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
+ 0b01101001, a,
+ );
+ let e = _mm512_set_pd(0., 0., 0.25, 0., 0.25, 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_reduce_pd() {
+ let a = _mm_set_pd(0.25, 0.50);
+ let r = _mm_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
+ let e = _mm_set_pd(0.25, 0.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_mask_reduce_pd() {
+ let a = _mm_set_pd(0.25, 0.50);
+ let src = _mm_set_pd(3., 4.);
+ let r = _mm_mask_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01, a);
+ let e = _mm_set_pd(3., 0.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_maskz_reduce_pd() {
+ let a = _mm_set_pd(0.25, 0.50);
+ let r = _mm_maskz_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01, a);
+ let e = _mm_set_pd(0., 0.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm256_reduce_pd() {
+ let a = _mm256_set_pd(0.25, 0.50, 0.75, 1.0);
+ let r = _mm256_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
+ let e = _mm256_set_pd(0.25, 0., 0.25, 0.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm256_mask_reduce_pd() {
+ let a = _mm256_set_pd(0.25, 0.50, 0.75, 1.0);
+ let src = _mm256_set_pd(3., 4., 5., 6.);
+ let r = _mm256_mask_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0110, a);
+ let e = _mm256_set_pd(3., 0., 0.25, 4.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm256_maskz_reduce_pd() {
+ let a = _mm256_set_pd(0.25, 0.50, 0.75, 1.0);
+ let r = _mm256_maskz_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0110, a);
+ let e = _mm256_set_pd(0., 0., 0.25, 0.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq")]
+ unsafe fn test_mm512_reduce_pd() {
+ let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0);
+ let r = _mm512_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
+ let e = _mm512_set_pd(0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq")]
+ unsafe fn test_mm512_mask_reduce_pd() {
+ let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0);
+ let src = _mm512_set_pd(3., 4., 5., 6., 7., 8., 9., 10.);
+ let r = _mm512_mask_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01101001, a);
+ let e = _mm512_set_pd(3., 0., 0.25, 4., 0.25, 6., 7., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq")]
+ unsafe fn test_mm512_maskz_reduce_pd() {
+ let a = _mm512_set_pd(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0);
+ let r = _mm512_maskz_reduce_pd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01101001, a);
+ let e = _mm512_set_pd(0., 0., 0.25, 0., 0.25, 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq")]
+ unsafe fn test_mm512_reduce_round_ps() {
+ let a = _mm512_set_ps(
+ 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75,
+ 4.0,
+ );
+ let r = _mm512_reduce_round_ps::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a);
+ let e = _mm512_set_ps(
+ 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq")]
+ unsafe fn test_mm512_mask_reduce_round_ps() {
+ let a = _mm512_set_ps(
+ 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75,
+ 4.0,
+ );
+ let src = _mm512_set_ps(
+ 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20.,
+ );
+ let r = _mm512_mask_reduce_round_ps::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
+ src,
+ 0b0110100100111100,
+ a,
+ );
+ let e = _mm512_set_ps(
+ 5., 0., 0.25, 6., 0.25, 8., 9., 0., 13., 14., 0.25, 0., 0.25, 0., 19., 20.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq")]
+ unsafe fn test_mm512_maskz_reduce_round_ps() {
+ let a = _mm512_set_ps(
+ 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75,
+ 4.0,
+ );
+ let r = _mm512_maskz_reduce_round_ps::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
+ 0b0110100100111100,
+ a,
+ );
+ let e = _mm512_set_ps(
+ 0., 0., 0.25, 0., 0.25, 0., 0., 0., 0., 0., 0.25, 0., 0.25, 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_reduce_ps() {
+ let a = _mm_set_ps(0.25, 0.50, 0.75, 1.0);
+ let r = _mm_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
+ let e = _mm_set_ps(0.25, 0., 0.25, 0.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_mask_reduce_ps() {
+ let a = _mm_set_ps(0.25, 0.50, 0.75, 1.0);
+ let src = _mm_set_ps(2., 3., 4., 5.);
+ let r = _mm_mask_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0110, a);
+ let e = _mm_set_ps(2., 0., 0.25, 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_maskz_reduce_ps() {
+ let a = _mm_set_ps(0.25, 0.50, 0.75, 1.0);
+ let r = _mm_maskz_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0110, a);
+ let e = _mm_set_ps(0., 0., 0.25, 0.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm256_reduce_ps() {
+ let a = _mm256_set_ps(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0);
+ let r = _mm256_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
+ let e = _mm256_set_ps(0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm256_mask_reduce_ps() {
+ let a = _mm256_set_ps(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0);
+ let src = _mm256_set_ps(3., 4., 5., 6., 7., 8., 9., 10.);
+ let r = _mm256_mask_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01101001, a);
+ let e = _mm256_set_ps(3., 0., 0.25, 4., 0.25, 6., 7., 0.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm256_maskz_reduce_ps() {
+ let a = _mm256_set_ps(0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0);
+ let r = _mm256_maskz_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01101001, a);
+ let e = _mm256_set_ps(0., 0., 0.25, 0., 0.25, 0., 0., 0.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq")]
+ unsafe fn test_mm512_reduce_ps() {
+ let a = _mm512_set_ps(
+ 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75,
+ 4.0,
+ );
+ let r = _mm512_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
+ let e = _mm512_set_ps(
+ 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0., 0.25, 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq")]
+ unsafe fn test_mm512_mask_reduce_ps() {
+ let a = _mm512_set_ps(
+ 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75,
+ 4.0,
+ );
+ let src = _mm512_set_ps(
+ 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20.,
+ );
+ let r = _mm512_mask_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0110100100111100, a);
+ let e = _mm512_set_ps(
+ 5., 0., 0.25, 6., 0.25, 8., 9., 0., 13., 14., 0.25, 0., 0.25, 0., 19., 20.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq")]
+ unsafe fn test_mm512_maskz_reduce_ps() {
+ let a = _mm512_set_ps(
+ 0.25, 0.50, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0, 2.25, 2.50, 2.75, 3.0, 3.25, 3.50, 3.75,
+ 4.0,
+ );
+ let r = _mm512_maskz_reduce_ps::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0110100100111100, a);
+ let e = _mm512_set_ps(
+ 0., 0., 0.25, 0., 0.25, 0., 0., 0., 0., 0., 0.25, 0., 0.25, 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_reduce_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_sd(0.25);
+ let r = _mm_reduce_round_sd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b);
+ let e = _mm_set_pd(1., 0.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_mask_reduce_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_sd(0.25);
+ let c = _mm_set_pd(3., 4.);
+ let r = _mm_mask_reduce_round_sd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
+ c, 0b0, a, b,
+ );
+ let e = _mm_set_pd(1., 4.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_maskz_reduce_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_sd(0.25);
+ let r =
+ _mm_maskz_reduce_round_sd::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0b0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_reduce_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_sd(0.25);
+ let r = _mm_reduce_sd::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b);
+ let e = _mm_set_pd(1., 0.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_mask_reduce_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_sd(0.25);
+ let c = _mm_set_pd(3., 4.);
+ let r = _mm_mask_reduce_sd::<{ 16 | _MM_FROUND_TO_ZERO }>(c, 0b0, a, b);
+ let e = _mm_set_pd(1., 4.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_maskz_reduce_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_sd(0.25);
+ let r = _mm_maskz_reduce_sd::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_reduce_round_ss() {
+ let a = _mm_set_ps(1., 2., 3., 4.);
+ let b = _mm_set_ss(0.25);
+ let r = _mm_reduce_round_ss::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b);
+ let e = _mm_set_ps(1., 2., 3., 0.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_mask_reduce_round_ss() {
+ let a = _mm_set_ps(1., 2., 3., 4.);
+ let b = _mm_set_ss(0.25);
+ let c = _mm_set_ps(5., 6., 7., 8.);
+ let r = _mm_mask_reduce_round_ss::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
+ c, 0b0, a, b,
+ );
+ let e = _mm_set_ps(1., 2., 3., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_maskz_reduce_round_ss() {
+ let a = _mm_set_ps(1., 2., 3., 4.);
+ let b = _mm_set_ss(0.25);
+ let r =
+ _mm_maskz_reduce_round_ss::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0b0, a, b);
+ let e = _mm_set_ps(1., 2., 3., 0.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_reduce_ss() {
+ let a = _mm_set_ps(1., 2., 3., 4.);
+ let b = _mm_set_ss(0.25);
+ let r = _mm_reduce_ss::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b);
+ let e = _mm_set_ps(1., 2., 3., 0.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_mask_reduce_ss() {
+ let a = _mm_set_ps(1., 2., 3., 4.);
+ let b = _mm_set_ss(0.25);
+ let c = _mm_set_ps(5., 6., 7., 8.);
+ let r = _mm_mask_reduce_ss::<{ 16 | _MM_FROUND_TO_ZERO }>(c, 0b0, a, b);
+ let e = _mm_set_ps(1., 2., 3., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512dq,avx512vl")]
+ unsafe fn test_mm_maskz_reduce_ss() {
+ let a = _mm_set_ps(1., 2., 3., 4.);
+ let b = _mm_set_ss(0.25);
+ let r = _mm_maskz_reduce_ss::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0, a, b);
+ let e = _mm_set_ps(1., 2., 3., 0.);
+ assert_eq_m128(r, e);
+ }
}