From debe317dcf0802e8ce16ff56af45380ab7ce09d7 Mon Sep 17 00:00:00 2001
From: sayantn
Date: Mon, 15 Jul 2024 16:01:06 +0530
Subject: [PATCH] AVX512FP16 Part 5: FP-Support
`getexp`, `getmant`, `roundscale`, `scalef`, `reduce`
---
.../stdarch/crates/core_arch/missing-x86.md | 90 -
.../crates/core_arch/src/x86/avx512fp16.rs | 3309 ++++++++++++++++-
2 files changed, 3297 insertions(+), 102 deletions(-)
diff --git a/library/stdarch/crates/core_arch/missing-x86.md b/library/stdarch/crates/core_arch/missing-x86.md
index c0b8aa14572b..72fc8b840e27 100644
--- a/library/stdarch/crates/core_arch/missing-x86.md
+++ b/library/stdarch/crates/core_arch/missing-x86.md
@@ -103,10 +103,6 @@
* [ ] [`_mm512_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxph_ps)
* [ ] [`_mm512_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxps_ph)
* [ ] [`_mm512_fpclass_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fpclass_ph_mask)
- * [ ] [`_mm512_getexp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ph)
- * [ ] [`_mm512_getexp_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ph)
- * [ ] [`_mm512_getmant_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ph)
- * [ ] [`_mm512_getmant_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ph)
* [ ] [`_mm512_mask_blend_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ph)
* [ ] [`_mm512_mask_cmp_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ph_mask)
* [ ] [`_mm512_mask_cmp_round_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
@@ -155,16 +151,6 @@
* [ ] [`_mm512_mask_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxph_ps)
* [ ] [`_mm512_mask_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxps_ph)
* [ ] [`_mm512_mask_fpclass_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fpclass_ph_mask)
- * [ ] [`_mm512_mask_getexp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ph)
- * [ ] [`_mm512_mask_getexp_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ph)
- * [ ] [`_mm512_mask_getmant_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ph)
- * [ ] [`_mm512_mask_getmant_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ph)
- * [ ] [`_mm512_mask_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph)
- * [ ] [`_mm512_mask_reduce_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph)
- * [ ] [`_mm512_mask_roundscale_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ph)
- * [ ] [`_mm512_mask_roundscale_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ph)
- * [ ] [`_mm512_mask_scalef_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ph)
- * [ ] [`_mm512_mask_scalef_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph)
* [ ] [`_mm512_maskz_cvt_roundepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi16_ph)
* [ ] [`_mm512_maskz_cvt_roundepi32_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ph)
* [ ] [`_mm512_maskz_cvt_roundepi64_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi64_ph)
@@ -209,28 +195,12 @@
* [ ] [`_mm512_maskz_cvtx_roundps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundps_ph)
* [ ] [`_mm512_maskz_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxph_ps)
* [ ] [`_mm512_maskz_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxps_ph)
- * [ ] [`_mm512_maskz_getexp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ph)
- * [ ] [`_mm512_maskz_getexp_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ph)
- * [ ] [`_mm512_maskz_getmant_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ph)
- * [ ] [`_mm512_maskz_getmant_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ph)
- * [ ] [`_mm512_maskz_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph)
- * [ ] [`_mm512_maskz_reduce_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph)
- * [ ] [`_mm512_maskz_roundscale_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ph)
- * [ ] [`_mm512_maskz_roundscale_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ph)
- * [ ] [`_mm512_maskz_scalef_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ph)
- * [ ] [`_mm512_maskz_scalef_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ph)
* [ ] [`_mm512_permutex2var_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ph)
* [ ] [`_mm512_permutexvar_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ph)
* [ ] [`_mm512_reduce_add_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ph)
* [ ] [`_mm512_reduce_max_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ph)
* [ ] [`_mm512_reduce_min_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ph)
* [ ] [`_mm512_reduce_mul_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ph)
- * [ ] [`_mm512_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_ph)
- * [ ] [`_mm512_reduce_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_round_ph)
- * [ ] [`_mm512_roundscale_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ph)
- * [ ] [`_mm512_roundscale_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ph)
- * [ ] [`_mm512_scalef_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph)
- * [ ] [`_mm512_scalef_round_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph)
* [ ] [`_mm512_set1_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_pch)
* [ ] [`_mm_cvt_roundi32_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi32_sh)
* [ ] [`_mm_cvt_roundi64_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_sh)
@@ -268,10 +238,6 @@
* [ ] [`_mm_cvtu32_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sh)
* [ ] [`_mm_cvtu64_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_sh)
* [ ] [`_mm_fpclass_sh_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_sh_mask)
- * [ ] [`_mm_getexp_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sh)
- * [ ] [`_mm_getexp_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sh)
- * [ ] [`_mm_getmant_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sh)
- * [ ] [`_mm_getmant_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sh)
* [ ] [`_mm_mask_cvt_roundsd_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsd_sh)
* [ ] [`_mm_mask_cvt_roundsh_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_sd)
* [ ] [`_mm_mask_cvt_roundsh_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_ss)
@@ -281,16 +247,6 @@
* [ ] [`_mm_mask_cvtsh_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_ss)
* [ ] [`_mm_mask_cvtss_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sh)
* [ ] [`_mm_mask_fpclass_sh_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_sh_mask)
- * [ ] [`_mm_mask_getexp_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sh)
- * [ ] [`_mm_mask_getexp_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh)
- * [ ] [`_mm_mask_getmant_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh)
- * [ ] [`_mm_mask_getmant_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh)
- * [ ] [`_mm_mask_reduce_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh)
- * [ ] [`_mm_mask_reduce_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh)
- * [ ] [`_mm_mask_roundscale_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sh)
- * [ ] [`_mm_mask_roundscale_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sh)
- * [ ] [`_mm_mask_scalef_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sh)
- * [ ] [`_mm_mask_scalef_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh)
* [ ] [`_mm_maskz_cvt_roundsd_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_sh)
* [ ] [`_mm_maskz_cvt_roundsh_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_sd)
* [ ] [`_mm_maskz_cvt_roundsh_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_ss)
@@ -299,22 +255,6 @@
* [ ] [`_mm_maskz_cvtsh_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_sd)
* [ ] [`_mm_maskz_cvtsh_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_ss)
* [ ] [`_mm_maskz_cvtss_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sh)
- * [ ] [`_mm_maskz_getexp_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sh)
- * [ ] [`_mm_maskz_getexp_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh)
- * [ ] [`_mm_maskz_getmant_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh)
- * [ ] [`_mm_maskz_getmant_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh)
- * [ ] [`_mm_maskz_reduce_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh)
- * [ ] [`_mm_maskz_reduce_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh)
- * [ ] [`_mm_maskz_roundscale_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sh)
- * [ ] [`_mm_maskz_roundscale_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sh)
- * [ ] [`_mm_maskz_scalef_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sh)
- * [ ] [`_mm_maskz_scalef_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh)
- * [ ] [`_mm_reduce_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh)
- * [ ] [`_mm_reduce_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh)
- * [ ] [`_mm_roundscale_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sh)
- * [ ] [`_mm_roundscale_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sh)
- * [ ] [`_mm_scalef_round_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh)
- * [ ] [`_mm_scalef_sh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh)
* [ ] [`_mm_set1_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pch)
@@ -345,8 +285,6 @@
* [ ] [`_mm256_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxph_ps)
* [ ] [`_mm256_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxps_ph)
* [ ] [`_mm256_fpclass_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fpclass_ph_mask)
- * [ ] [`_mm256_getexp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ph)
- * [ ] [`_mm256_getmant_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ph)
* [ ] [`_mm256_mask_blend_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ph)
* [ ] [`_mm256_mask_cmp_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ph_mask)
* [ ] [`_mm256_mask_cvtepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_ph)
@@ -372,11 +310,6 @@
* [ ] [`_mm256_mask_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxph_ps)
* [ ] [`_mm256_mask_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxps_ph)
* [ ] [`_mm256_mask_fpclass_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fpclass_ph_mask)
- * [ ] [`_mm256_mask_getexp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ph)
- * [ ] [`_mm256_mask_getmant_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph)
- * [ ] [`_mm256_mask_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph)
- * [ ] [`_mm256_mask_roundscale_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph)
- * [ ] [`_mm256_mask_scalef_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph)
* [ ] [`_mm256_maskz_cvtepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_ph)
* [ ] [`_mm256_maskz_cvtepi32_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ph)
* [ ] [`_mm256_maskz_cvtepi64_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_ph)
@@ -399,20 +332,12 @@
* [ ] [`_mm256_maskz_cvttph_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu64)
* [ ] [`_mm256_maskz_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxph_ps)
* [ ] [`_mm256_maskz_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxps_ph)
- * [ ] [`_mm256_maskz_getexp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph)
- * [ ] [`_mm256_maskz_getmant_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph)
- * [ ] [`_mm256_maskz_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph)
- * [ ] [`_mm256_maskz_roundscale_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph)
- * [ ] [`_mm256_maskz_scalef_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ph)
* [ ] [`_mm256_permutex2var_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ph)
* [ ] [`_mm256_permutexvar_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ph)
* [ ] [`_mm256_reduce_add_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_ph)
* [ ] [`_mm256_reduce_max_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_ph)
* [ ] [`_mm256_reduce_min_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_ph)
* [ ] [`_mm256_reduce_mul_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_ph)
- * [ ] [`_mm256_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_ph)
- * [ ] [`_mm256_roundscale_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ph)
- * [ ] [`_mm256_scalef_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ph)
* [ ] [`_mm_cmp_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ph_mask)
* [ ] [`_mm_cvtepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_ph)
* [ ] [`_mm_cvtepi32_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ph)
@@ -437,8 +362,6 @@
* [ ] [`_mm_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxph_ps)
* [ ] [`_mm_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxps_ph)
* [ ] [`_mm_fpclass_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_ph_mask)
- * [ ] [`_mm_getexp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ph)
- * [ ] [`_mm_getmant_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ph)
* [ ] [`_mm_mask_blend_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ph)
* [ ] [`_mm_mask_cmp_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ph_mask)
* [ ] [`_mm_mask_cvtepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_ph)
@@ -464,11 +387,6 @@
* [ ] [`_mm_mask_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxph_ps)
* [ ] [`_mm_mask_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxps_ph)
* [ ] [`_mm_mask_fpclass_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_ph_mask)
- * [ ] [`_mm_mask_getexp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ph)
- * [ ] [`_mm_mask_getmant_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ph)
- * [ ] [`_mm_mask_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph)
- * [ ] [`_mm_mask_roundscale_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph)
- * [ ] [`_mm_mask_scalef_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph)
* [ ] [`_mm_maskz_cvtepi16_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_ph)
* [ ] [`_mm_maskz_cvtepi32_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ph)
* [ ] [`_mm_maskz_cvtepi64_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_ph)
@@ -491,20 +409,12 @@
* [ ] [`_mm_maskz_cvttph_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu64)
* [ ] [`_mm_maskz_cvtxph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxph_ps)
* [ ] [`_mm_maskz_cvtxps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxps_ph)
- * [ ] [`_mm_maskz_getexp_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph)
- * [ ] [`_mm_maskz_getmant_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ph)
- * [ ] [`_mm_maskz_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph)
- * [ ] [`_mm_maskz_roundscale_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph)
- * [ ] [`_mm_maskz_scalef_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ph)
* [ ] [`_mm_permutex2var_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ph)
* [ ] [`_mm_permutexvar_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_ph)
* [ ] [`_mm_reduce_add_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_ph)
* [ ] [`_mm_reduce_max_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_ph)
* [ ] [`_mm_reduce_min_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_ph)
* [ ] [`_mm_reduce_mul_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_ph)
- * [ ] [`_mm_reduce_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_ph)
- * [ ] [`_mm_roundscale_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ph)
- * [ ] [`_mm_scalef_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ph)
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs b/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs
index b30bc63ed4d3..3c04d9ae9081 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs
@@ -624,12 +624,13 @@ pub unsafe fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_cmp_round_sh_mask<const IMM8: i32, const SAE: i32>(
+pub unsafe fn _mm_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(
a: __m128h,
b: __m128h,
) -> __mmask8 {
+ static_assert_uimm_bits!(IMM5, 5);
static_assert_sae!(SAE);
- _mm_mask_cmp_round_sh_mask::<IMM8, SAE>(0xff, a, b)
+ _mm_mask_cmp_round_sh_mask::<IMM5, SAE>(0xff, a, b)
}
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
@@ -641,13 +642,14 @@ pub unsafe fn _mm_cmp_round_sh_mask<const IMM8: i32, const SAE: i32>(
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_mask_cmp_round_sh_mask<const IMM8: i32, const SAE: i32>(
+pub unsafe fn _mm_mask_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(
k1: __mmask8,
a: __m128h,
b: __m128h,
) -> __mmask8 {
+ static_assert_uimm_bits!(IMM5, 5);
static_assert_sae!(SAE);
- vcmpsh(a, b, IMM8, k1, SAE)
+ vcmpsh(a, b, IMM5, k1, SAE)
}
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
@@ -658,8 +660,9 @@ pub unsafe fn _mm_mask_cmp_round_sh_mask<const IMM8: i32, const SAE: i32>(
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_cmp_sh_mask<const IMM8: i32>(a: __m128h, b: __m128h) -> __mmask8 {
- _mm_cmp_round_sh_mask::<IMM8, _MM_FROUND_CUR_DIRECTION>(a, b)
+pub unsafe fn _mm_cmp_sh_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
+ static_assert_uimm_bits!(IMM5, 5);
+ _mm_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
@@ -670,12 +673,13 @@ pub unsafe fn _mm_cmp_sh_mask<const IMM8: i32>(a: __m128h, b: __m128h) -> __mmas
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_mask_cmp_sh_mask<const IMM8: i32>(
+pub unsafe fn _mm_mask_cmp_sh_mask<const IMM5: i32>(
k1: __mmask8,
a: __m128h,
b: __m128h,
) -> __mmask8 {
- _mm_mask_cmp_round_sh_mask::<IMM8, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
+ static_assert_uimm_bits!(IMM5, 5);
+ _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
}
/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
@@ -706,9 +710,10 @@ pub unsafe fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_comi_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> i32 {
+pub unsafe fn _mm_comi_round_sh<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> i32 {
+ static_assert_uimm_bits!(IMM5, 5);
static_assert_sae!(SAE);
- vcomish(a, b, IMM8, SAE)
+ vcomish(a, b, IMM5, SAE)
}
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
@@ -719,8 +724,9 @@ pub unsafe fn _mm_comi_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b:
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_comi_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> i32 {
- _mm_comi_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(a, b)
+pub unsafe fn _mm_comi_sh<const IMM5: i32>(a: __m128h, b: __m128h) -> i32 {
+ static_assert_uimm_bits!(IMM5, 5);
+ _mm_comi_round_sh::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and return
@@ -8440,6 +8446,2199 @@ pub unsafe fn _mm_maskz_min_round_sh<const SAE: i32>(
 _mm_mask_min_round_sh::<SAE>(_mm_setzero_ph(), k, a, b)
}
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
+/// This intrinsic essentially calculates `floor(log2(x))` for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_getexp_ph(a: __m128h) -> __m128h {
+ _mm_mask_getexp_ph(_mm_undefined_ph(), 0xff, a)
+}
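+
+// A minimal usage sketch (illustrative only; `_mm_set1_ph` and `_mm_cvtsh_h` are
+// assumed to be available from this module, on a CPU with AVX512FP16 + AVX512VL):
+//
+//     let e = _mm_getexp_ph(_mm_set1_ph(8.0));
+//     assert_eq!(_mm_cvtsh_h(e), 3.0); // floor(log2(8.0)) == 3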
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
+/// `floor(log2(x))` for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
+ vgetexpph_128(a, src, k)
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
+/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
+/// `floor(log2(x))` for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h {
+ _mm_mask_getexp_ph(_mm_setzero_ph(), k, a)
+}
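+
+// Hedged sketch of the two masking flavors (mask value illustrative): with
+// k = 0b0000_0011, lanes 0-1 receive getexp results; the mask form copies the
+// remaining lanes from `src`, while the maskz form zeroes them.
+//
+//     let kept = _mm_mask_getexp_ph(src, 0b0000_0011, a);
+//     let zeroed = _mm_maskz_getexp_ph(0b0000_0011, a);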
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
+/// This intrinsic essentially calculates `floor(log2(x))` for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_getexp_ph(a: __m256h) -> __m256h {
+ _mm256_mask_getexp_ph(_mm256_undefined_ph(), 0xffff, a)
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
+/// `floor(log2(x))` for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
+ vgetexpph_256(a, src, k)
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
+/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
+/// `floor(log2(x))` for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetexpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h {
+ _mm256_mask_getexp_ph(_mm256_setzero_ph(), k, a)
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
+/// This intrinsic essentially calculates `floor(log2(x))` for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_getexp_ph(a: __m512h) -> __m512h {
+ _mm512_mask_getexp_ph(_mm512_undefined_ph(), 0xffffffff, a)
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
+/// `floor(log2(x))` for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
+ _mm512_mask_getexp_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
+/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
+/// `floor(log2(x))` for each element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h {
+ _mm512_mask_getexp_ph(_mm512_setzero_ph(), k, a)
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
+/// This intrinsic essentially calculates `floor(log2(x))` for each element. Exceptions can be suppressed
+/// by passing _MM_FROUND_NO_EXC in the sae parameter
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_getexp_round_ph<const SAE: i32>(a: __m512h) -> __m512h {
+ static_assert_sae!(SAE);
+ _mm512_mask_getexp_round_ph::<SAE>(_mm512_undefined_ph(), 0xffffffff, a)
+}
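+
+// Illustrative call (SAE constants as used elsewhere in this crate): passing
+// _MM_FROUND_NO_EXC suppresses exceptions, while _MM_FROUND_CUR_DIRECTION makes
+// this behave like the plain `_mm512_getexp_ph`.
+//
+//     let e = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a);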
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
+/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_getexp_round_ph<const SAE: i32>(
+ src: __m512h,
+ k: __mmask32,
+ a: __m512h,
+) -> __m512h {
+ static_assert_sae!(SAE);
+ vgetexpph_512(a, src, k, SAE)
+}
+
+/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
+/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
+/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_maskz_getexp_round_ph<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512h {
+ static_assert_sae!(SAE);
+ _mm512_mask_getexp_round_ph::<SAE>(_mm512_setzero_ph(), k, a)
+}
+
+/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
+/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
+/// calculates `floor(log2(x))` for the lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h {
+ _mm_mask_getexp_sh(_mm_undefined_ph(), 0xff, a, b)
+}
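+
+// Sketch of the scalar semantics (values illustrative): only lane 0 of `b` is
+// converted, and lanes 1..8 of the result are copied from `a`.
+//
+//     let r = _mm_getexp_sh(a, b); // r[0] = floor(log2(|b[0]|)), r[1..8] = a[1..8]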
+
+/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
+/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
+/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
+/// for the lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+ _mm_mask_getexp_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
+/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
+/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
+/// lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+ _mm_mask_getexp_sh(_mm_setzero_ph(), k, a, b)
+}
+
+/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
+/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
+/// calculates `floor(log2(x))` for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
+/// in the sae parameter
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_getexp_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
+ static_assert_sae!(SAE);
+ _mm_mask_getexp_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
+/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
+/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
+/// for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_getexp_round_sh<const SAE: i32>(
+ src: __m128h,
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_sae!(SAE);
+ vgetexpsh(a, b, src, k, SAE)
+}
+
+/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
+/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
+/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
+/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
+/// lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_getexp_round_sh<const SAE: i32>(
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_sae!(SAE);
+ _mm_mask_getexp_round_sh::<SAE>(_mm_setzero_ph(), k, a, b)
+}
+
+/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
+/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
+/// on the interval range defined by norm and the sign depends on sign and the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(1, 2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_getmant_ph<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ a: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ _mm_mask_getmant_ph::<NORM, SIGN>(_mm_undefined_ph(), 0xff, a)
+}
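+
+// Worked example (hedged; the NORM/SIGN constants are the avx512f mantissa enums):
+// with NORM = _MM_MANT_NORM_1_2 the mantissa is normalized into [1, 2), so every
+// lane of 12.0 yields 1.5, since 12.0 == 1.5 * 2^3.
+//
+//     let m = _mm_getmant_ph::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(_mm_set1_ph(12.0));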
+
+/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
+/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
+/// by norm and the sign depends on sign and the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_getmant_ph<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ src: __m128h,
+ k: __mmask8,
+ a: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ vgetmantph_128(a, (SIGN << 2) | NORM, src, k)
+}
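+
+// Note on the immediate: SIGN occupies bits 3:2 and NORM bits 1:0 of the VGETMANTPH
+// imm8. For example (constant values per Intel's enums), _MM_MANT_SIGN_ZERO (1)
+// with _MM_MANT_NORM_P5_2 (1) encodes as (1 << 2) | 1 == 0b0101.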
+
+/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
+/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
+/// by norm and the sign depends on sign and the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_getmant_ph<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ k: __mmask8,
+ a: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ _mm_mask_getmant_ph::<NORM, SIGN>(_mm_setzero_ph(), k, a)
+}
+
+/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
+/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
+/// on the interval range defined by norm and the sign depends on sign and the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(1, 2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_getmant_ph<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ a: __m256h,
+) -> __m256h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_undefined_ph(), 0xffff, a)
+}
+
+/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
+/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
+/// by norm and the sign depends on sign and the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_mask_getmant_ph<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ src: __m256h,
+ k: __mmask16,
+ a: __m256h,
+) -> __m256h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ vgetmantph_256(a, (SIGN << 2) | NORM, src, k)
+}
+
+/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
+/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
+/// by norm and the sign depends on sign and the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_maskz_getmant_ph<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ k: __mmask16,
+ a: __m256h,
+) -> __m256h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_setzero_ph(), k, a)
+}
+
+/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
+/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
+/// on the interval range defined by norm and the sign depends on sign and the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(1, 2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_getmant_ph<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ a: __m512h,
+) -> __m512h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_undefined_ph(), 0xffffffff, a)
+}
+
+/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
+/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
+/// by norm and the sign depends on sign and the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_getmant_ph<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ src: __m512h,
+ k: __mmask32,
+ a: __m512h,
+) -> __m512h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ _mm512_mask_getmant_round_ph::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
+/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
+/// by norm and the sign depends on sign and the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_maskz_getmant_ph<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ k: __mmask32,
+ a: __m512h,
+) -> __m512h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_setzero_ph(), k, a)
+}
+
+/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
+/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
+/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
+/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
+#[rustc_legacy_const_generics(1, 2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_getmant_round_ph<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ a: __m512h,
+) -> __m512h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ static_assert_sae!(SAE);
+ _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
+}
+
+/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
+/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
+/// by norm and the sign depends on sign and the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4, 5)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_getmant_round_ph<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ src: __m512h,
+ k: __mmask32,
+ a: __m512h,
+) -> __m512h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ static_assert_sae!(SAE);
+ vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE)
+}
+
+/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
+/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
+/// by norm and the sign depends on sign and the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_maskz_getmant_round_ph<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ k: __mmask32,
+ a: __m512h,
+) -> __m512h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ static_assert_sae!(SAE);
+ _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_setzero_ph(), k, a)
+}
+
+/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
+/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
+/// on the interval range defined by norm and the sign depends on sign and the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_getmant_sh<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ _mm_mask_getmant_sh::<NORM, SIGN>(_mm_undefined_ph(), 0xff, a, b)
+}
+
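+// The scalar `sh` variants follow the usual lower-lane convention: only lane 0
+// of `b` is normalized, and lanes 1..=7 of the result come from `a`. A minimal
+// illustrative call (`a` and `b` assumed initialized):
+//
+//     // dst[0] = getmant(b[0]); dst[1..8] = a[1..8]
+//     let r = _mm_getmant_sh::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_src>(a, b);
+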
+/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
+/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
+/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
+/// the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(4, 5)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_getmant_sh<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ src: __m128h,
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ _mm_mask_getmant_round_sh::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
+/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
+/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
+/// the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_getmant_sh<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+>(
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ _mm_mask_getmant_sh::<NORM, SIGN>(_mm_setzero_ph(), k, a, b)
+}
+
+/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
+/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
+/// on the interval range defined by norm and the sign depends on sign and the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_getmant_round_sh<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ static_assert_sae!(SAE);
+ _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
+/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
+/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
+/// the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5, 6)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_getmant_round_sh<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ src: __m128h,
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ static_assert_sae!(SAE);
+ vgetmantsh(a, b, (SIGN << 2) | NORM, src, k, SAE)
+}
+
+/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
+/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
+/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
+/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
+/// the source sign.
+///
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+///
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+///
+/// The sign is determined by sc which can take the following values:
+///
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4, 5)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_getmant_round_sh<
+ const NORM: _MM_MANTISSA_NORM_ENUM,
+ const SIGN: _MM_MANTISSA_SIGN_ENUM,
+ const SAE: i32,
+>(
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(NORM, 4);
+ static_assert_uimm_bits!(SIGN, 2);
+ static_assert_sae!(SAE);
+ _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(_mm_setzero_ph(), k, a, b)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_roundscale_ph<const IMM8: i32>(a: __m128h) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_roundscale_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
+/// the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_roundscale_ph<const IMM8: i32>(
+ src: __m128h,
+ k: __mmask8,
+ a: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ vrndscaleph_128(a, IMM8, src, k)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_roundscale_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_roundscale_ph::<IMM8>(_mm_setzero_ph(), k, a)
+}
+
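+// Note that imm8 carries more than the rounding mode listed above: its upper
+// four bits select the number of fraction bits M to keep, i.e. the result is
+// round(x * 2^M) / 2^M. A hand-worked sketch (illustrative only, assuming
+// nightly `f16` literal support):
+//
+//     let a = _mm_set1_ph(1.3);
+//     // M = 1 rounds to multiples of 0.5; 1.3 rounds to nearest as 1.5
+//     let r = _mm_roundscale_ph::<{ (1 << 4) | _MM_FROUND_TO_NEAREST_INT }>(a);
+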
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_roundscale_ph<const IMM8: i32>(a: __m256h) -> __m256h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_mask_roundscale_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
+/// the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_mask_roundscale_ph<const IMM8: i32>(
+ src: __m256h,
+ k: __mmask16,
+ a: __m256h,
+) -> __m256h {
+ static_assert_uimm_bits!(IMM8, 8);
+ vrndscaleph_256(a, IMM8, src, k)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_maskz_roundscale_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_mask_roundscale_ph::<IMM8>(_mm256_setzero_ph(), k, a)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_roundscale_ph<const IMM8: i32>(a: __m512h) -> __m512h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_mask_roundscale_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
+/// the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_roundscale_ph<const IMM8: i32>(
+ src: __m512h,
+ k: __mmask32,
+ a: __m512h,
+) -> __m512h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_mask_roundscale_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_maskz_roundscale_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_mask_roundscale_ph::<IMM8>(_mm512_setzero_ph(), k, a)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
+/// in the sae parameter.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(1, 2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_roundscale_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
+/// the corresponding mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
+/// in the sae parameter.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
+ src: __m512h,
+ k: __mmask32,
+ a: __m512h,
+) -> __m512h {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ vrndscaleph_512(a, IMM8, src, k, SAE)
+}
+
+/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
+/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_maskz_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
+ k: __mmask32,
+ a: __m512h,
+) -> __m512h {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
+}
+
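+// A sketch of the extra SAE const generic on the `round` variants: it accepts
+// _MM_FROUND_CUR_DIRECTION (exceptions reported as usual) or _MM_FROUND_NO_EXC
+// to suppress them (illustrative call, `a` assumed initialized):
+//
+//     // truncate to whole numbers (M = 0) without raising FP exceptions
+//     let r = _mm512_roundscale_round_ph::<_MM_FROUND_TO_ZERO, _MM_FROUND_NO_EXC>(a);
+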
+/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
+/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
+/// from a to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_roundscale_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_roundscale_sh::<IMM8>(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
+/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
+/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_roundscale_sh<const IMM8: i32>(
+ src: __m128h,
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_roundscale_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
+/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
+/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_roundscale_sh<const IMM8: i32>(
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_roundscale_sh::<IMM8>(_mm_setzero_ph(), k, a, b)
+}
+
+/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
+/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
+/// from a to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ _mm_mask_roundscale_round_sh::<IMM8, SAE>(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
+/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
+/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
+ src: __m128h,
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ vrndscalesh(a, b, src, k, IMM8, SAE)
+}
+
+/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
+/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
+/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ _mm_mask_roundscale_round_sh::<IMM8, SAE>(_mm_setzero_ph(), k, a, b)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
+/// the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h {
+ _mm_mask_scalef_ph(_mm_undefined_ph(), 0xff, a, b)
+}
+
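+// "Scale" here means a * 2^floor(b) per element, computed in a single
+// instruction so the power of two never overflows an intermediate value. A
+// hand-worked sketch (illustrative only, assuming nightly `f16` literal
+// support):
+//
+//     let a = _mm_set1_ph(3.0);
+//     let b = _mm_set1_ph(2.5);
+//     // floor(2.5) = 2, so each lane becomes 3.0 * 2^2 = 12.0
+//     let r = _mm_scalef_ph(a, b);
+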
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
+/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+ vscalefph_128(a, b, src, k)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
+/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+ _mm_mask_scalef_ph(_mm_setzero_ph(), k, a, b)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
+/// the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h {
+ _mm256_mask_scalef_ph(_mm256_undefined_ph(), 0xffff, a, b)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
+/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
+ vscalefph_256(a, b, src, k)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
+/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vscalefph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
+ _mm256_mask_scalef_ph(_mm256_setzero_ph(), k, a, b)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
+/// the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h {
+ _mm512_mask_scalef_ph(_mm512_undefined_ph(), 0xffffffff, a, b)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
+/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
+ _mm512_mask_scalef_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
+/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefph))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
+ _mm512_mask_scalef_ph(_mm512_setzero_ph(), k, a, b)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
+/// the results in dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_scalef_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
+ static_assert_rounding!(ROUNDING);
+ _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_undefined_ph(), 0xffffffff, a, b)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
+/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_scalef_round_ph<const ROUNDING: i32>(
+ src: __m512h,
+ k: __mmask32,
+ a: __m512h,
+ b: __m512h,
+) -> __m512h {
+ static_assert_rounding!(ROUNDING);
+ vscalefph_512(a, b, src, k, ROUNDING)
+}
+
+/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
+/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_maskz_scalef_round_ph<const ROUNDING: i32>(
+ k: __mmask32,
+ a: __m512h,
+ b: __m512h,
+) -> __m512h {
+ static_assert_rounding!(ROUNDING);
+ _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
+}
+
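+// A sketch of passing a combined rounding constant to the `round` variants
+// (illustrative call; the explicit rounding modes must be OR-ed with
+// _MM_FROUND_NO_EXC, only _MM_FROUND_CUR_DIRECTION may stand alone):
+//
+//     let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+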
+/// Scale the lower half-precision (16-bit) floating-point element in a using the value in the lower
+/// element of b, store the result in the lower element of dst, and copy the upper 7 packed elements
+/// from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h {
+ _mm_mask_scalef_sh(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Scale the lower half-precision (16-bit) floating-point element in a using the value in the lower
+/// element of b, store the result in the lower element of dst using writemask k (the element is copied
+/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+ _mm_mask_scalef_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Scale the lower half-precision (16-bit) floating-point element in a using the value in the lower
+/// element of b, store the result in the lower element of dst using zeromask k (the element is zeroed
+/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefsh))]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+ _mm_mask_scalef_sh(_mm_setzero_ph(), k, a, b)
+}
+
+/// Scale the lower half-precision (16-bit) floating-point element in a using the value in the lower
+/// element of b, store the result in the lower element of dst, and copy the upper 7 packed elements
+/// from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_scalef_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
+ static_assert_rounding!(ROUNDING);
+ _mm_mask_scalef_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Scale the lower half-precision (16-bit) floating-point element in a using the value in the lower
+/// element of b, store the result in the lower element of dst using writemask k (the element is copied
+/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_scalef_round_sh<const ROUNDING: i32>(
+ src: __m128h,
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_rounding!(ROUNDING);
+ vscalefsh(a, b, src, k, ROUNDING)
+}
+
+/// Scale the lower half-precision (16-bit) floating-point element in a using the value in the lower
+/// element of b, store the result in the lower element of dst using zeromask k (the element is zeroed
+/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding parameter, which can be one of:
+///
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_scalef_round_sh<const ROUNDING: i32>(
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_rounding!(ROUNDING);
+ _mm_mask_scalef_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
+}
+
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_reduce_ph<const IMM8: i32>(a: __m128h) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_reduce_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
+}
+
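+// The "reduced argument" is what remains after rounding away the kept fraction
+// bits: reduce(x) = x - roundscale(x) for the same imm8. A hand-worked sketch
+// (illustrative only, assuming nightly `f16` literal support):
+//
+//     let a = _mm_set1_ph(1.25);
+//     // M = 0 with truncation: roundscale gives 1.0, so the result is 0.25
+//     let r = _mm_reduce_ph::<_MM_FROUND_TO_ZERO>(a);
+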
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_reduce_ph<const IMM8: i32>(
+ src: __m128h,
+ k: __mmask8,
+ a: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ vreduceph_128(a, IMM8, src, k)
+}
+
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_reduce_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_reduce_ph::<IMM8>(_mm_setzero_ph(), k, a)
+}
+
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_reduce_ph<const IMM8: i32>(a: __m256h) -> __m256h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_mask_reduce_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
+}
+
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_mask_reduce_ph<const IMM8: i32>(
+ src: __m256h,
+ k: __mmask16,
+ a: __m256h,
+) -> __m256h {
+ static_assert_uimm_bits!(IMM8, 8);
+ vreduceph_256(a, IMM8, src, k)
+}
+
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512vl")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_maskz_reduce_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_mask_reduce_ph::<IMM8>(_mm256_setzero_ph(), k, a)
+}
+
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_reduce_ph<const IMM8: i32>(a: __m512h) -> __m512h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_mask_reduce_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
+}
+
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_reduce_ph<const IMM8: i32>(
+ src: __m512h,
+ k: __mmask32,
+ a: __m512h,
+) -> __m512h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_mask_reduce_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_maskz_reduce_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_mask_reduce_ph::<IMM8>(_mm512_setzero_ph(), k, a)
+}
+
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(1, 2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_reduce_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
+}
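+
+// Usage sketch, mirroring the tests at the end of this patch: both const
+// parameters are spelled out at the call site, e.g.
+//
+// let r = _mm512_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a);
+//
+// where 16 (= 0b0001_0000) requests one fraction bit in IMM8[7:4] and
+// _MM_FROUND_NO_EXC as SAE suppresses floating-point exceptions.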
+
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_reduce_round_ph<const IMM8: i32, const SAE: i32>(
+ src: __m512h,
+ k: __mmask32,
+ a: __m512h,
+) -> __m512h {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ vreduceph_512(a, IMM8, src, k, SAE)
+}
+
+/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
+/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
+/// out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_maskz_reduce_round_ph<const IMM8: i32, const SAE: i32>(
+ k: __mmask32,
+ a: __m512h,
+) -> __m512h {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
+}
+
+/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
+/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the
+/// upper 7 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_reduce_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_reduce_sh::<IMM8>(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
+/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
+/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from
+/// a to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_reduce_sh<const IMM8: i32>(
+ src: __m128h,
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_reduce_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
+/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
+/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
+/// to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_reduce_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_reduce_sh::<IMM8>(_mm_setzero_ph(), k, a, b)
+}
+
+/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
+/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper
+/// 7 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_reduce_round_sh<const IMM8: i32, const SAE: i32>(
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ _mm_mask_reduce_round_sh::<IMM8, SAE>(_mm_undefined_ph(), 0xff, a, b)
+}
+
+/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
+/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
+/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a
+/// to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(4, 5)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_mask_reduce_round_sh<const IMM8: i32, const SAE: i32>(
+ src: __m128h,
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ vreducesh(a, b, src, k, IMM8, SAE)
+}
+
+/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
+/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
+/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
+/// to the upper elements of dst.
+///
+/// Rounding is done according to the imm8 parameter, which can be one of:
+///
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_maskz_reduce_round_sh<const IMM8: i32, const SAE: i32>(
+ k: __mmask8,
+ a: __m128h,
+ b: __m128h,
+) -> __m128h {
+ static_assert_uimm_bits!(IMM8, 8);
+ static_assert_sae!(SAE);
+ _mm_mask_reduce_round_sh::<IMM8, SAE>(_mm_setzero_ph(), k, a, b)
+}
+
#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"]
@@ -8574,6 +10773,65 @@ extern "C" {
#[link_name = "llvm.x86.avx512fp16.mask.min.sh.round"]
fn vminsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
+ #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.128"]
+ fn vgetexpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
+ #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.256"]
+ fn vgetexpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
+ #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.512"]
+ fn vgetexpph_512(a: __m512h, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
+ #[link_name = "llvm.x86.avx512fp16.mask.getexp.sh"]
+ fn vgetexpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
+
+ #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.128"]
+ fn vgetmantph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
+ #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.256"]
+ fn vgetmantph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
+ #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.512"]
+ fn vgetmantph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
+ #[link_name = "llvm.x86.avx512fp16.mask.getmant.sh"]
+ fn vgetmantsh(
+ a: __m128h,
+ b: __m128h,
+ imm8: i32,
+ src: __m128h,
+ k: __mmask8,
+ sae: i32,
+ ) -> __m128h;
+
+ #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.128"]
+ fn vrndscaleph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
+ #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.256"]
+ fn vrndscaleph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
+ #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.512"]
+ fn vrndscaleph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
+ #[link_name = "llvm.x86.avx512fp16.mask.rndscale.sh"]
+ fn vrndscalesh(
+ a: __m128h,
+ b: __m128h,
+ src: __m128h,
+ k: __mmask8,
+ imm8: i32,
+ sae: i32,
+ ) -> __m128h;
+
+ #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.128"]
+ fn vscalefph_128(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
+ #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.256"]
+ fn vscalefph_256(a: __m256h, b: __m256h, src: __m256h, k: __mmask16) -> __m256h;
+ #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.512"]
+ fn vscalefph_512(a: __m512h, b: __m512h, src: __m512h, k: __mmask32, rounding: i32) -> __m512h;
+ #[link_name = "llvm.x86.avx512fp16.mask.scalef.sh"]
+ fn vscalefsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
+
+ #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.128"]
+ fn vreduceph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
+ #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.256"]
+ fn vreduceph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
+ #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.512"]
+ fn vreduceph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
+ #[link_name = "llvm.x86.avx512fp16.mask.reduce.sh"]
+ fn vreducesh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, imm8: i32, sae: i32)
+ -> __m128h;
}
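+
+// Note on the shape of these bindings, shared by all the masked intrinsics
+// above: `src` and `k` implement merge-masking (lanes whose mask bit is clear
+// take their value from `src`), and the trailing `sae`/`rounding` argument
+// forwards the _MM_FROUND_* constant into the instruction encoding.
+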
#[cfg(test)]
@@ -14469,4 +16727,1031 @@ mod tests {
let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
assert_eq_m128h(r, e);
}
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_getexp_ph() {
+ let a = _mm_set1_ph(3.0);
+ let r = _mm_getexp_ph(a);
+ let e = _mm_set1_ph(1.0);
+ assert_eq_m128h(r, e);
+ }
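+
+ // Worked value for the getexp tests: getexp(x) returns floor(log2(|x|)) as
+ // a float, so getexp(3.0) = 1.0 because 3.0 = 1.5 * 2^1.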
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_mask_getexp_ph() {
+ let a = _mm_set1_ph(3.0);
+ let src = _mm_set1_ph(4.0);
+ let r = _mm_mask_getexp_ph(src, 0b01010101, a);
+ let e = _mm_set_ph(4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_maskz_getexp_ph() {
+ let a = _mm_set1_ph(3.0);
+ let r = _mm_maskz_getexp_ph(0b01010101, a);
+ let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_getexp_ph() {
+ let a = _mm256_set1_ph(3.0);
+ let r = _mm256_getexp_ph(a);
+ let e = _mm256_set1_ph(1.0);
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_mask_getexp_ph() {
+ let a = _mm256_set1_ph(3.0);
+ let src = _mm256_set1_ph(4.0);
+ let r = _mm256_mask_getexp_ph(src, 0b0101010101010101, a);
+ let e = _mm256_set_ph(
+ 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
+ );
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_maskz_getexp_ph() {
+ let a = _mm256_set1_ph(3.0);
+ let r = _mm256_maskz_getexp_ph(0b0101010101010101, a);
+ let e = _mm256_set_ph(
+ 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
+ );
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_getexp_ph() {
+ let a = _mm512_set1_ph(3.0);
+ let r = _mm512_getexp_ph(a);
+ let e = _mm512_set1_ph(1.0);
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_mask_getexp_ph() {
+ let a = _mm512_set1_ph(3.0);
+ let src = _mm512_set1_ph(4.0);
+ let r = _mm512_mask_getexp_ph(src, 0b01010101010101010101010101010101, a);
+ let e = _mm512_set_ph(
+ 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
+ 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_maskz_getexp_ph() {
+ let a = _mm512_set1_ph(3.0);
+ let r = _mm512_maskz_getexp_ph(0b01010101010101010101010101010101, a);
+ let e = _mm512_set_ph(
+ 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
+ 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_getexp_round_ph() {
+ let a = _mm512_set1_ph(3.0);
+ let r = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a);
+ let e = _mm512_set1_ph(1.0);
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_mask_getexp_round_ph() {
+ let a = _mm512_set1_ph(3.0);
+ let src = _mm512_set1_ph(4.0);
+ let r = _mm512_mask_getexp_round_ph::<_MM_FROUND_NO_EXC>(
+ src,
+ 0b01010101010101010101010101010101,
+ a,
+ );
+ let e = _mm512_set_ph(
+ 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
+ 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_maskz_getexp_round_ph() {
+ let a = _mm512_set1_ph(3.0);
+ let r = _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>(
+ 0b01010101010101010101010101010101,
+ a,
+ );
+ let e = _mm512_set_ph(
+ 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
+ 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_getexp_sh() {
+ let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_getexp_sh(a, b);
+ let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_mask_getexp_sh() {
+ let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
+ let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
+ let r = _mm_mask_getexp_sh(src, 0, a, b);
+ let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_mask_getexp_sh(src, 1, a, b);
+ let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_maskz_getexp_sh() {
+ let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_maskz_getexp_sh(0, a, b);
+ let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_maskz_getexp_sh(1, a, b);
+ let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_getexp_round_sh() {
+ let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_getexp_round_sh::<_MM_FROUND_NO_EXC>(a, b);
+ let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_mask_getexp_round_sh() {
+ let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
+ let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
+ let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 0, a, b);
+ let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 1, a, b);
+ let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_maskz_getexp_round_sh() {
+ let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(0, a, b);
+ let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(1, a, b);
+ let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_getmant_ph() {
+ let a = _mm_set1_ph(10.0);
+ let r = _mm_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
+ let e = _mm_set1_ph(1.25);
+ assert_eq_m128h(r, e);
+ }
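+
+ // Worked value for the getmant tests: _MM_MANT_NORM_P75_1P5 normalizes the
+ // mantissa into the interval [0.75, 1.5), so 10.0 = 1.25 * 2^3 yields 1.25.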
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_mask_getmant_ph() {
+ let a = _mm_set1_ph(10.0);
+ let src = _mm_set1_ph(20.0);
+ let r = _mm_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0b01010101, a);
+ let e = _mm_set_ph(20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_maskz_getmant_ph() {
+ let a = _mm_set1_ph(10.0);
+ let r = _mm_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0b01010101, a);
+ let e = _mm_set_ph(0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_getmant_ph() {
+ let a = _mm256_set1_ph(10.0);
+ let r = _mm256_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
+ let e = _mm256_set1_ph(1.25);
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_mask_getmant_ph() {
+ let a = _mm256_set1_ph(10.0);
+ let src = _mm256_set1_ph(20.0);
+ let r = _mm256_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
+ src,
+ 0b0101010101010101,
+ a,
+ );
+ let e = _mm256_set_ph(
+ 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
+ 20.0, 1.25,
+ );
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_maskz_getmant_ph() {
+ let a = _mm256_set1_ph(10.0);
+ let r = _mm256_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
+ 0b0101010101010101,
+ a,
+ );
+ let e = _mm256_set_ph(
+ 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
+ );
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_getmant_ph() {
+ let a = _mm512_set1_ph(10.0);
+ let r = _mm512_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
+ let e = _mm512_set1_ph(1.25);
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_mask_getmant_ph() {
+ let a = _mm512_set1_ph(10.0);
+ let src = _mm512_set1_ph(20.0);
+ let r = _mm512_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
+ src,
+ 0b01010101010101010101010101010101,
+ a,
+ );
+ let e = _mm512_set_ph(
+ 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
+ 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
+ 20.0, 1.25, 20.0, 1.25,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_maskz_getmant_ph() {
+ let a = _mm512_set1_ph(10.0);
+ let r = _mm512_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
+ 0b01010101010101010101010101010101,
+ a,
+ );
+ let e = _mm512_set_ph(
+ 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
+ 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_getmant_round_ph() {
+ let a = _mm512_set1_ph(10.0);
+ let r =
+ _mm512_getmant_round_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
+ a,
+ );
+ let e = _mm512_set1_ph(1.25);
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_mask_getmant_round_ph() {
+ let a = _mm512_set1_ph(10.0);
+ let src = _mm512_set1_ph(20.0);
+ let r = _mm512_mask_getmant_round_ph::<
+ _MM_MANT_NORM_P75_1P5,
+ _MM_MANT_SIGN_NAN,
+ _MM_FROUND_NO_EXC,
+ >(src, 0b01010101010101010101010101010101, a);
+ let e = _mm512_set_ph(
+ 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
+ 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
+ 20.0, 1.25, 20.0, 1.25,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_maskz_getmant_round_ph() {
+ let a = _mm512_set1_ph(10.0);
+ let r = _mm512_maskz_getmant_round_ph::<
+ _MM_MANT_NORM_P75_1P5,
+ _MM_MANT_SIGN_NAN,
+ _MM_FROUND_NO_EXC,
+ >(0b01010101010101010101010101010101, a);
+ let e = _mm512_set_ph(
+ 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
+ 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_getmant_sh() {
+ let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a, b);
+ let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_mask_getmant_sh() {
+ let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
+ let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
+ let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0, a, b);
+ let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 1, a, b);
+ let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_maskz_getmant_sh() {
+ let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0, a, b);
+ let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(1, a, b);
+ let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_getmant_round_sh() {
+ let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_getmant_round_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
+ a, b,
+ );
+ let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_mask_getmant_round_sh() {
+ let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
+ let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
+ let r = _mm_mask_getmant_round_sh::<
+ _MM_MANT_NORM_P75_1P5,
+ _MM_MANT_SIGN_NAN,
+ _MM_FROUND_NO_EXC,
+ >(src, 0, a, b);
+ let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_mask_getmant_round_sh::<
+ _MM_MANT_NORM_P75_1P5,
+ _MM_MANT_SIGN_NAN,
+ _MM_FROUND_NO_EXC,
+ >(src, 1, a, b);
+ let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_maskz_getmant_round_sh() {
+ let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_maskz_getmant_round_sh::<
+ _MM_MANT_NORM_P75_1P5,
+ _MM_MANT_SIGN_NAN,
+ _MM_FROUND_NO_EXC,
+ >(0, a, b);
+ let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_maskz_getmant_round_sh::<
+ _MM_MANT_NORM_P75_1P5,
+ _MM_MANT_SIGN_NAN,
+ _MM_FROUND_NO_EXC,
+ >(1, a, b);
+ let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_roundscale_ph() {
+ let a = _mm_set1_ph(1.1);
+ let r = _mm_roundscale_ph::<0>(a);
+ let e = _mm_set1_ph(1.0);
+ assert_eq_m128h(r, e);
+ }
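+
+ // Worked value for the roundscale tests: IMM8 = 0 keeps zero fraction bits
+ // and rounds to nearest, so each 1.1 becomes 1.0.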
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_mask_roundscale_ph() {
+ let a = _mm_set1_ph(1.1);
+ let src = _mm_set1_ph(2.0);
+ let r = _mm_mask_roundscale_ph::<0>(src, 0b01010101, a);
+ let e = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_maskz_roundscale_ph() {
+ let a = _mm_set1_ph(1.1);
+ let r = _mm_maskz_roundscale_ph::<0>(0b01010101, a);
+ let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_roundscale_ph() {
+ let a = _mm256_set1_ph(1.1);
+ let r = _mm256_roundscale_ph::<0>(a);
+ let e = _mm256_set1_ph(1.0);
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_mask_roundscale_ph() {
+ let a = _mm256_set1_ph(1.1);
+ let src = _mm256_set1_ph(2.0);
+ let r = _mm256_mask_roundscale_ph::<0>(src, 0b0101010101010101, a);
+ let e = _mm256_set_ph(
+ 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
+ );
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_maskz_roundscale_ph() {
+ let a = _mm256_set1_ph(1.1);
+ let r = _mm256_maskz_roundscale_ph::<0>(0b0101010101010101, a);
+ let e = _mm256_set_ph(
+ 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
+ );
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_roundscale_ph() {
+ let a = _mm512_set1_ph(1.1);
+ let r = _mm512_roundscale_ph::<0>(a);
+ let e = _mm512_set1_ph(1.0);
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_mask_roundscale_ph() {
+ let a = _mm512_set1_ph(1.1);
+ let src = _mm512_set1_ph(2.0);
+ let r = _mm512_mask_roundscale_ph::<0>(src, 0b01010101010101010101010101010101, a);
+ let e = _mm512_set_ph(
+ 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
+ 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_maskz_roundscale_ph() {
+ let a = _mm512_set1_ph(1.1);
+ let r = _mm512_maskz_roundscale_ph::<0>(0b01010101010101010101010101010101, a);
+ let e = _mm512_set_ph(
+ 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
+ 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_roundscale_round_ph() {
+ let a = _mm512_set1_ph(1.1);
+ let r = _mm512_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(a);
+ let e = _mm512_set1_ph(1.0);
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_mask_roundscale_round_ph() {
+ let a = _mm512_set1_ph(1.1);
+ let src = _mm512_set1_ph(2.0);
+ let r = _mm512_mask_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
+ src,
+ 0b01010101010101010101010101010101,
+ a,
+ );
+ let e = _mm512_set_ph(
+ 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
+ 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_maskz_roundscale_round_ph() {
+ let a = _mm512_set1_ph(1.1);
+ let r = _mm512_maskz_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
+ 0b01010101010101010101010101010101,
+ a,
+ );
+ let e = _mm512_set_ph(
+ 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
+ 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_roundscale_sh() {
+ let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_roundscale_sh::<0>(a, b);
+ let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_mask_roundscale_sh() {
+ let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
+ let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
+ let r = _mm_mask_roundscale_sh::<0>(src, 0, a, b);
+ let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_mask_roundscale_sh::<0>(src, 1, a, b);
+ let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_maskz_roundscale_sh() {
+ let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_maskz_roundscale_sh::<0>(0, a, b);
+ let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_maskz_roundscale_sh::<0>(1, a, b);
+ let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_roundscale_round_sh() {
+ let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(a, b);
+ let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_mask_roundscale_round_sh() {
+ let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
+ let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
+ let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0, a, b);
+ let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 1, a, b);
+ let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_maskz_roundscale_round_sh() {
+ let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(0, a, b);
+ let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(1, a, b);
+ let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_scalef_ph() {
+ let a = _mm_set1_ph(1.);
+ let b = _mm_set1_ph(3.);
+ let r = _mm_scalef_ph(a, b);
+ let e = _mm_set1_ph(8.0);
+ assert_eq_m128h(r, e);
+ }
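+
+ // Worked value for the scalef tests: scalef(a, b) computes a * 2^floor(b),
+ // so 1.0 * 2^floor(3.0) = 8.0.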
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_mask_scalef_ph() {
+ let a = _mm_set1_ph(1.);
+ let b = _mm_set1_ph(3.);
+ let src = _mm_set1_ph(2.);
+ let r = _mm_mask_scalef_ph(src, 0b01010101, a, b);
+ let e = _mm_set_ph(2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_maskz_scalef_ph() {
+ let a = _mm_set1_ph(1.);
+ let b = _mm_set1_ph(3.);
+ let r = _mm_maskz_scalef_ph(0b01010101, a, b);
+ let e = _mm_set_ph(0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_scalef_ph() {
+ let a = _mm256_set1_ph(1.);
+ let b = _mm256_set1_ph(3.);
+ let r = _mm256_scalef_ph(a, b);
+ let e = _mm256_set1_ph(8.0);
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_mask_scalef_ph() {
+ let a = _mm256_set1_ph(1.);
+ let b = _mm256_set1_ph(3.);
+ let src = _mm256_set1_ph(2.);
+ let r = _mm256_mask_scalef_ph(src, 0b0101010101010101, a, b);
+ let e = _mm256_set_ph(
+ 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
+ );
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_maskz_scalef_ph() {
+ let a = _mm256_set1_ph(1.);
+ let b = _mm256_set1_ph(3.);
+ let r = _mm256_maskz_scalef_ph(0b0101010101010101, a, b);
+ let e = _mm256_set_ph(
+ 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
+ );
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_scalef_ph() {
+ let a = _mm512_set1_ph(1.);
+ let b = _mm512_set1_ph(3.);
+ let r = _mm512_scalef_ph(a, b);
+ let e = _mm512_set1_ph(8.0);
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_mask_scalef_ph() {
+ let a = _mm512_set1_ph(1.);
+ let b = _mm512_set1_ph(3.);
+ let src = _mm512_set1_ph(2.);
+ let r = _mm512_mask_scalef_ph(src, 0b01010101010101010101010101010101, a, b);
+ let e = _mm512_set_ph(
+ 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
+ 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_maskz_scalef_ph() {
+ let a = _mm512_set1_ph(1.);
+ let b = _mm512_set1_ph(3.);
+ let r = _mm512_maskz_scalef_ph(0b01010101010101010101010101010101, a, b);
+ let e = _mm512_set_ph(
+ 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
+ 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_scalef_round_ph() {
+ let a = _mm512_set1_ph(1.);
+ let b = _mm512_set1_ph(3.);
+ let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm512_set1_ph(8.0);
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_mask_scalef_round_ph() {
+ let a = _mm512_set1_ph(1.);
+ let b = _mm512_set1_ph(3.);
+ let src = _mm512_set1_ph(2.);
+ let r = _mm512_mask_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ src,
+ 0b01010101010101010101010101010101,
+ a,
+ b,
+ );
+ let e = _mm512_set_ph(
+ 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
+ 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_maskz_scalef_round_ph() {
+ let a = _mm512_set1_ph(1.);
+ let b = _mm512_set1_ph(3.);
+ let r = _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ 0b01010101010101010101010101010101,
+ a,
+ b,
+ );
+ let e = _mm512_set_ph(
+ 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
+ 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_scalef_sh() {
+ let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_scalef_sh(a, b);
+ let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_mask_scalef_sh() {
+ let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
+ let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
+ let r = _mm_mask_scalef_sh(src, 0, a, b);
+ let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_mask_scalef_sh(src, 1, a, b);
+ let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_maskz_scalef_sh() {
+ let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_maskz_scalef_sh(0, a, b);
+ let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_maskz_scalef_sh(1, a, b);
+ let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_scalef_round_sh() {
+ let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
+ let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_mask_scalef_round_sh() {
+ let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
+ let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
+ let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ src, 0, a, b,
+ );
+ let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
+ src, 1, a, b,
+ );
+ let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_maskz_scalef_round_sh() {
+ let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
+ let r =
+ _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
+ let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r =
+ _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
+ let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_reduce_ph() {
+ let a = _mm_set1_ph(1.25);
+ let r = _mm_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
+ let e = _mm_set1_ph(0.25);
+ assert_eq_m128h(r, e);
+ }
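+
+ // Here 16 | _MM_FROUND_TO_ZERO sets IMM8[7:4] = 1 (keep one fraction bit)
+ // with truncation: 1.25 - trunc(1.25 * 2) / 2 = 0.25.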
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_mask_reduce_ph() {
+ let a = _mm_set1_ph(1.25);
+ let src = _mm_set1_ph(2.0);
+ let r = _mm_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01010101, a);
+ let e = _mm_set_ph(2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm_maskz_reduce_ph() {
+ let a = _mm_set1_ph(1.25);
+ let r = _mm_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01010101, a);
+ let e = _mm_set_ph(0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_reduce_ph() {
+ let a = _mm256_set1_ph(1.25);
+ let r = _mm256_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
+ let e = _mm256_set1_ph(0.25);
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_mask_reduce_ph() {
+ let a = _mm256_set1_ph(1.25);
+ let src = _mm256_set1_ph(2.0);
+ let r = _mm256_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0101010101010101, a);
+ let e = _mm256_set_ph(
+ 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
+ );
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16,avx512vl")]
+ unsafe fn test_mm256_maskz_reduce_ph() {
+ let a = _mm256_set1_ph(1.25);
+ let r = _mm256_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0101010101010101, a);
+ let e = _mm256_set_ph(
+ 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
+ );
+ assert_eq_m256h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_reduce_ph() {
+ let a = _mm512_set1_ph(1.25);
+ let r = _mm512_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
+ let e = _mm512_set1_ph(0.25);
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_mask_reduce_ph() {
+ let a = _mm512_set1_ph(1.25);
+ let src = _mm512_set1_ph(2.0);
+ let r = _mm512_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
+ src,
+ 0b01010101010101010101010101010101,
+ a,
+ );
+ let e = _mm512_set_ph(
+ 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
+ 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_maskz_reduce_ph() {
+ let a = _mm512_set1_ph(1.25);
+ let r = _mm512_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
+ 0b01010101010101010101010101010101,
+ a,
+ );
+ let e = _mm512_set_ph(
+ 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
+ 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_reduce_round_ph() {
+ let a = _mm512_set1_ph(1.25);
+ let r = _mm512_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a);
+ let e = _mm512_set1_ph(0.25);
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_mask_reduce_round_ph() {
+ let a = _mm512_set1_ph(1.25);
+ let src = _mm512_set1_ph(2.0);
+ let r = _mm512_mask_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
+ src,
+ 0b01010101010101010101010101010101,
+ a,
+ );
+ let e = _mm512_set_ph(
+ 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
+ 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm512_maskz_reduce_round_ph() {
+ let a = _mm512_set1_ph(1.25);
+ let r = _mm512_maskz_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
+ 0b01010101010101010101010101010101,
+ a,
+ );
+ let e = _mm512_set_ph(
+ 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
+ 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
+ );
+ assert_eq_m512h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_reduce_sh() {
+ let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b);
+ let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_mask_reduce_sh() {
+ let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
+ let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
+ let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0, a, b);
+ let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 1, a, b);
+ let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_maskz_reduce_sh() {
+ let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(0, a, b);
+ let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(1, a, b);
+ let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_reduce_round_sh() {
+ let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
+ let r = _mm_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b);
+ let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_mask_reduce_round_sh() {
+ let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
+ let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
+ let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
+ src, 0, a, b,
+ );
+ let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
+ src, 1, a, b,
+ );
+ let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
+
+ #[simd_test(enable = "avx512fp16")]
+ unsafe fn test_mm_maskz_reduce_round_sh() {
+ let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
+ let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
+ let r =
+ _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0, a, b);
+ let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ let r =
+ _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(1, a, b);
+ let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
+ assert_eq_m128h(r, e);
+ }
}