From 936e1add97be31c41ebbd4cf9110e9c042fd168e Mon Sep 17 00:00:00 2001 From: kangshan1157 Date: Thu, 11 Feb 2021 07:29:27 +0800 Subject: [PATCH] Implement avx512bf16 intrinsics (#998) --- .../docker/i586-unknown-linux-gnu/Dockerfile | 2 +- .../docker/i686-unknown-linux-gnu/Dockerfile | 2 +- .../Dockerfile | 3 +- .../x86_64-unknown-linux-gnu/Dockerfile | 2 +- library/stdarch/ci/run.sh | 3 + .../crates/core_arch/src/x86/avx512bf16.rs | 1018 +++++++++++++++++ .../stdarch/crates/core_arch/src/x86/mod.rs | 133 +++ .../stdarch/crates/stdarch-verify/src/lib.rs | 3 + .../crates/stdarch-verify/tests/x86-intel.rs | 18 + 9 files changed, 1180 insertions(+), 4 deletions(-) create mode 100644 library/stdarch/crates/core_arch/src/x86/avx512bf16.rs diff --git a/library/stdarch/ci/docker/i586-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/i586-unknown-linux-gnu/Dockerfile index 01093698f679..0e4d1c6eb466 100644 --- a/library/stdarch/ci/docker/i586-unknown-linux-gnu/Dockerfile +++ b/library/stdarch/ci/docker/i586-unknown-linux-gnu/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:18.04 +FROM ubuntu:20.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc-multilib \ libc6-dev \ diff --git a/library/stdarch/ci/docker/i686-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/i686-unknown-linux-gnu/Dockerfile index 01093698f679..0e4d1c6eb466 100644 --- a/library/stdarch/ci/docker/i686-unknown-linux-gnu/Dockerfile +++ b/library/stdarch/ci/docker/i686-unknown-linux-gnu/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:18.04 +FROM ubuntu:20.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc-multilib \ libc6-dev \ diff --git a/library/stdarch/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile index 40dbebdcc91d..b7fc930523e7 100644 --- a/library/stdarch/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile +++ 
b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:18.04 +FROM ubuntu:20.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ libc6-dev \ @@ -10,4 +10,5 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ RUN wget https://github.com/gnzlbg/intel_sde/raw/master/sde-external-8.35.0-2019-03-11-lin.tar.bz2 RUN tar -xjf sde-external-8.35.0-2019-03-11-lin.tar.bz2 +ENV SKIP_TESTS="avx512bf16" ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/sde-external-8.35.0-2019-03-11-lin/sde64 -rtm_mode full --" diff --git a/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile index db64f7f915ee..dc4c4e59897d 100644 --- a/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile +++ b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:18.04 +FROM ubuntu:20.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ libc6-dev \ diff --git a/library/stdarch/ci/run.sh b/library/stdarch/ci/run.sh index 4f4eaa1cfcd9..699c89cecb10 100755 --- a/library/stdarch/ci/run.sh +++ b/library/stdarch/ci/run.sh @@ -56,6 +56,9 @@ cargo_test() { ;; esac + if [ "$SKIP_TESTS" != "" ]; then + cmd="$cmd --skip "$SKIP_TESTS + fi $cmd } diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs b/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs new file mode 100644 index 000000000000..da04b703427f --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs @@ -0,0 +1,1018 @@ +//! [AVX512BF16 intrinsics]. +//! +//! 
[AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16 + +use crate::{ + core_arch::{simd::*, simd_llvm::*, x86::*}, + mem::transmute, +}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.128"] + fn cvtne2ps2bf16(a: f32x4, b: f32x4) -> i16x8; + #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.256"] + fn cvtne2ps2bf16_256(a: f32x8, b: f32x8) -> i16x16; + #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.512"] + fn cvtne2ps2bf16_512(a: f32x16, b: f32x16) -> i16x32; + #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.256"] + fn cvtneps2bf16_256(a: f32x8) -> i16x8; + #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.512"] + fn cvtneps2bf16_512(a: f32x16) -> i16x16; + #[link_name = "llvm.x86.avx512bf16.dpbf16ps.128"] + fn dpbf16ps(a: f32x4, b: i32x4, c: i32x4) -> f32x4; + #[link_name = "llvm.x86.avx512bf16.dpbf16ps.256"] + fn dpbf16ps_256(a: f32x8, b: i32x8, c: i32x8) -> f32x8; + #[link_name = "llvm.x86.avx512bf16.dpbf16ps.512"] + fn dpbf16ps_512(a: f32x16, b: i32x16, c: i32x16) -> f32x16; +} + +/// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a +/// 128-bit wide vector. 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub unsafe fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh { + transmute(cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4())) +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using writemask k (elements are copied from src when the +/// corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_mask_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub unsafe fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh { + let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x8())) +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_maskz_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub unsafe fn _mm_maskz_cvtne2ps_pbh (k: __mmask8, a: __m128, b: __m128) -> __m128bh { + let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8(); + let zero = _mm_setzero_si128().as_u16x8(); + transmute(simd_select_bitmask(k, cvt, zero)) +} + +/// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a +/// 256-bit wide vector. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub unsafe fn _mm256_cvtne2ps_pbh (a: __m256, b: __m256) -> __m256bh { + transmute(cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8())) +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b +/// to packed BF16 (16-bit) floating-point elements, and store the results in single vector +/// dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_mask_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub unsafe fn _mm256_mask_cvtne2ps_pbh (src: __m256bh, k: __mmask16, a: __m256, b: __m256) -> __m256bh { + let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x16())) +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b +/// to packed BF16 (16-bit) floating-point elements, and store the results in single vector +/// dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub unsafe fn _mm256_maskz_cvtne2ps_pbh (k: __mmask16, a: __m256, b: __m256) -> __m256bh { + let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); + let zero = _mm256_setzero_si256().as_u16x16(); + transmute(simd_select_bitmask(k, cvt, zero)) +} + +/// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a +/// 512-bit wide vector. 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub unsafe fn _mm512_cvtne2ps_pbh (a: __m512, b: __m512) -> __m512bh { + transmute(cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16())) +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using writemask k (elements are copied from src when the +/// corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_mask_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub unsafe fn _mm512_mask_cvtne2ps_pbh (src: __m512bh, k: __mmask32, a: __m512, b: __m512) -> __m512bh { + let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x32())) +} + +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtne2ps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] +pub unsafe fn _mm512_maskz_cvtne2ps_pbh (k: __mmask32, a: __m512, b: __m512) -> __m512bh { + let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); + let zero = _mm512_setzero_si512().as_u16x32(); + transmute(simd_select_bitmask(k, cvt, zero)) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub unsafe fn _mm256_cvtneps_pbh (a: __m256) -> __m128bh { + transmute(cvtneps2bf16_256(a.as_f32x8())) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub unsafe fn _mm256_mask_cvtneps_pbh (src: __m128bh, k: __mmask8, a: __m256) -> __m128bh { + let cvt = _mm256_cvtneps_pbh(a).as_u16x8(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x8())) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub unsafe fn _mm256_maskz_cvtneps_pbh (k: __mmask8, a: __m256) -> __m128bh { + let cvt = _mm256_cvtneps_pbh(a).as_u16x8(); + let zero = _mm_setzero_si128().as_u16x8(); + transmute(simd_select_bitmask(k, cvt, zero)) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst. 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub unsafe fn _mm512_cvtneps_pbh (a: __m512) -> __m256bh { + transmute(cvtneps2bf16_512(a.as_f32x16())) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub unsafe fn _mm512_mask_cvtneps_pbh (src: __m256bh, k: __mmask16, a: __m512) -> __m256bh { + let cvt = _mm512_cvtneps_pbh(a).as_u16x16(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x16())) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +pub unsafe fn _mm512_maskz_cvtneps_pbh (k: __mmask16, a: __m512) -> __m256bh { + let cvt = _mm512_cvtneps_pbh(a).as_u16x16(); + let zero = _mm256_setzero_si256().as_u16x16(); + transmute(simd_select_bitmask(k, cvt, zero)) +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub unsafe fn _mm_dpbf16_ps (src: __m128, a: __m128bh, b: __m128bh) -> __m128 { + transmute(dpbf16ps(src.as_f32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_mask_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub unsafe fn _mm_mask_dpbf16_ps (src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 { + let rst = _mm_dpbf16_ps(src, a, b).as_f32x4(); + transmute(simd_select_bitmask(k, rst, src.as_f32x4())) +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_maskz_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub unsafe fn _mm_maskz_dpbf16_ps (k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 { + let rst = _mm_dpbf16_ps(src, a, b).as_f32x4(); + let zero = _mm_set1_ps(0.0_f32).as_f32x4(); + transmute(simd_select_bitmask(k, rst, zero)) +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst. 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub unsafe fn _mm256_dpbf16_ps (src: __m256, a: __m256bh, b: __m256bh) -> __m256 { + transmute(dpbf16ps_256(src.as_f32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub unsafe fn _mm256_mask_dpbf16_ps (src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 { + let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8(); + transmute(simd_select_bitmask(k, rst, src.as_f32x8())) +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub unsafe fn _mm256_maskz_dpbf16_ps (k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 { + let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8(); + let zero = _mm256_setzero_ps().as_f32x8(); + transmute(simd_select_bitmask(k, rst, zero)) +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst.Compute dot-product of BF16 (16-bit) +/// floating-point pairs in a and b, accumulating the intermediate single-precision (32-bit) +/// floating-point elements with elements in src, and store the results in dst. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub unsafe fn _mm512_dpbf16_ps (src: __m512, a: __m512bh, b: __m512bh) -> __m512 { + transmute(dpbf16ps_512(src.as_f32x16(), a.as_i32x16(), b.as_i32x16())) + } + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). 
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub unsafe fn _mm512_mask_dpbf16_ps (src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 { + let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16(); + transmute(simd_select_bitmask(k, rst, src.as_f32x16())) + } + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_dpbf16_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[cfg_attr(test, assert_instr("vdpbf16ps"))] +pub unsafe fn _mm512_maskz_dpbf16_ps (k: __mmask16, src: __m512, a: __m512bh, b: __m512bh) -> __m512 { + let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16(); + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, rst, zero)) + } + +#[cfg(test)] +mod tests { + use crate::{core_arch::x86::*, mem::transmute}; + use stdarch_test::simd_test; + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_cvtne2ps_pbh() { + let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; + let a: __m128 = transmute(a_array); + let b: __m128 = transmute(b_array); + let c: __m128bh = _mm_cvtne2ps_pbh(a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + let expected_result: [u16; 8] = [ + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b0_10000110_0110010, 
0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_mask_cvtne2ps_pbh(){ + let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; + let src_array: [u16; 8] = [ + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001]; + let src: __m128bh = transmute(src_array); + let a: __m128 = transmute(a_array); + let b: __m128 = transmute(b_array); + let k: __mmask8 = 0b1111_1111; + let c: __m128bh = _mm_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + let expected_result: [u16; 8] = [ + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001]; + assert_eq!(result, expected_result); + let k = 0b0000_0000; + let c = _mm_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + let expected_result = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_maskz_cvtne2ps_pbh(){ + let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; + let a: __m128 = transmute(a_array); + let b: __m128 = transmute(b_array); + let k: __mmask8 = 0b1111_1111; + let c: __m128bh = _mm_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + let expected_result: [u16; 8] = [ + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001]; + assert_eq!(result, expected_result); + let k = 0b0011_1100; + let c = _mm_maskz_cvtne2ps_pbh(k, 
a, b); + let result: [u16; 8] = transmute(c.as_u16x8()); + let expected_result: [u16; 8] = [ + 0, 0, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0, 0]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_cvtne2ps_pbh() { + let a_array = [ + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + let b_array = [ + -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, + -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32]; + let a: __m256 = transmute(a_array); + let b: __m256 = transmute(b_array); + let c: __m256bh = _mm256_cvtne2ps_pbh(a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + let expected_result: [u16; 16] = [ + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_mask_cvtne2ps_pbh() { + let a_array = [ + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + let b_array = [ + -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, + -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32]; + let src_array: [u16; 16] = [ + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001]; + let src: __m256bh = transmute(src_array); + let a: __m256 = transmute(a_array); + let 
b: __m256 = transmute(b_array); + let k: __mmask16 = 0xffff; + let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + let expected_result: [u16; 16] = [ + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0; + let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + let expected_result = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_maskz_cvtne2ps_pbh() { + let a_array = [ + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + let b_array = [ + -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, + -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32]; + let a: __m256 = transmute(a_array); + let b: __m256 = transmute(b_array); + let k: __mmask16 = 0xffff; + let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + let expected_result: [u16; 16] = [ + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0b0110_1100_0011_0110; + let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 16] = transmute(c.as_u16x16()); + let expected_result: [u16; 16] = [ + 
0, 0b1_10000010_0101000, 0b1_10000000_1110000, 0, + 0b1_10000011_0000100, 0b1_10000110_1111111, 0, 0, + 0, 0, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0, 0b0_10000110_1111111, 0b0_10001000_1111010, 0]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_cvtne2ps_pbh() { + let a_array = [ + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32, + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + let b_array = [ + -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, + -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32, + -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, + -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32]; + let a: __m512 = transmute(a_array); + let b: __m512 = transmute(b_array); + let c: __m512bh = _mm512_cvtne2ps_pbh(a, b); + let result: [u16; 32] = transmute(c.as_u16x32()); + let expected_result: [u16; 32] = [ + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_mask_cvtne2ps_pbh() { + let a_array = [ + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32, + 178.125_f32, 10.5_f32, 3.75_f32, 
50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + let b_array = [ + -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, + -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32, + -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, + -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32]; + let src_array: [u16; 32] = [ + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001]; + let src: __m512bh = transmute(src_array); + let a: __m512 = transmute(a_array); + let b: __m512 = transmute(b_array); + let k: __mmask32 = 0xffffffff; + let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 32] = transmute(c.as_u16x32()); + let expected_result: [u16; 32] = [ + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 
0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + assert_eq!(result, expected_result); + let k: __mmask32 = 0; + let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b); + let result: [u16; 32] = transmute(c.as_u16x32()); + let expected_result = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_maskz_cvtne2ps_pbh() { + let a_array = [ + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32, + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + let b_array = [ + -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, + -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32, + -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, + -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32]; + let a: __m512 = transmute(a_array); + let b: __m512 = transmute(b_array); + let k: __mmask32 = 0xffffffff; + let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b); + let result: [u16; 32] = transmute(c.as_u16x32()); + let expected_result: [u16; 32] = [ + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + assert_eq!(result, expected_result); + let k: __mmask32 = 0b1100_1010_1001_0110_1010_0011_0101_0110; + let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b); + let 
result: [u16; 32] = transmute(c.as_u16x32()); + let expected_result: [u16; 32] = [ + 0, 0b1_10000010_0101000, 0b1_10000000_1110000, 0, + 0b1_10000011_0000100, 0, 0b1_10001000_1111010, 0, + 0b1_10000110_0110010, 0b1_10000010_0101000, 0, 0, + 0, 0b1_10000110_1111111, 0, 0b1_10001000_0010000, + 0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, + 0b0_10000011_0000100, 0, 0, 0b0_10001000_0010000, + 0, 0b0_10000010_0101000, 0, 0b0_10000100_1001001, + 0, 0, 0b0_10001000_1111010, 0b0_10001000_0010000]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_cvtneps_pbh() { + let a_array = [ + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + let a: __m256 = transmute(a_array); + let c: __m128bh = _mm256_cvtneps_pbh(a); + let result: [u16; 8] = transmute(c.as_u16x8()); + let expected_result: [u16; 8] = [ + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_mask_cvtneps_pbh() { + let a_array = [ + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + let src_array: [u16; 8] = [ + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000]; + let src: __m128bh = transmute(src_array); + let a: __m256 = transmute(a_array); + let k: __mmask8 = 0xff; + let b = _mm256_mask_cvtneps_pbh(src, k, a); + let result: [u16; 8] = transmute(b.as_u16x8()); + let expected_result: [u16; 8] = [ + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + assert_eq!(result, 
expected_result); + let k: __mmask8 = 0x0; + let b: __m128bh = _mm256_mask_cvtneps_pbh (src, k, a); + let result: [u16; 8] = transmute(b.as_u16x8()); + let expected_result: [u16; 8] = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_maskz_cvtneps_pbh() { + let a_array = [ + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + let a: __m256 = transmute(a_array); + let k: __mmask8 = 0xff; + let b = _mm256_maskz_cvtneps_pbh(k, a); + let result: [u16; 8] = transmute(b.as_u16x8()); + let expected_result: [u16; 8] = [ + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0x6; + let b: __m128bh = _mm256_maskz_cvtneps_pbh (k, a); + let result: [u16; 8] = transmute(b.as_u16x8()); + let expected_result: [u16; 8] = [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_cvtneps_pbh() { + let a_array = [ + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32, + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + let a: __m512 = transmute(a_array); + let c: __m256bh = _mm512_cvtneps_pbh(a); + let result: [u16; 16] = transmute(c.as_u16x16()); + let expected_result: [u16; 16] = [ + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + assert_eq!(result, expected_result); 
+ } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_mask_cvtneps_pbh() { + let a_array = [ + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32, + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + let src_array: [u16; 16] = [ + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, + 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, + 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000]; + let src: __m256bh = transmute(src_array); + let a: __m512 = transmute(a_array); + let k: __mmask16 = 0xffff; + let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a); + let result: [u16; 16] = transmute(c.as_u16x16()); + let expected_result: [u16; 16] = [ + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0; + let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a); + let result: [u16; 16] = transmute(c.as_u16x16()); + let expected_result = src_array; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_maskz_cvtneps_pbh() { + let a_array = [ + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32, + 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + let a: __m512 = transmute(a_array); + let k: __mmask16 = 0xffff; + let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a); + let 
result: [u16; 16] = transmute(c.as_u16x16()); + let expected_result: [u16; 16] = [ + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000, + 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0x653a; + let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a); + let result: [u16; 16] = transmute(c.as_u16x16()); + let expected_result: [u16; 16] = [ + 0, 0b0_10000010_0101000, 0, 0b0_10000100_1001001, + 0b0_10000011_0000100, 0b0_10000110_1111111, 0, 0, + 0b0_10000110_0110010, 0, 0b0_10000000_1110000, 0, + 0, 0b0_10000110_1111111, 0b0_10001000_1111010, 0]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_dpbf16_ps() { + let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m128 = transmute(a_array); + let b1: __m128 = transmute(b_array); + let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); + let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); + let c: __m128 = _mm_dpbf16_ps (src, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_mask_dpbf16_ps() { + let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m128 = transmute(a_array); + let b1: __m128 = transmute(b_array); + let k: __mmask8 = 0xf3; + let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); + let b: 
__m128bh = _mm_cvtne2ps_pbh(b1, b1); + let c: __m128 = _mm_mask_dpbf16_ps (src, k, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0xff; + let c: __m128 = _mm_mask_dpbf16_ps (src, k, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0; + let c: __m128 = _mm_mask_dpbf16_ps (src, k, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_maskz_dpbf16_ps() { + let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m128 = transmute(a_array); + let b1: __m128 = transmute(b_array); + let k: __mmask8 = 0xf3; + let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); + let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); + let c: __m128 = _mm_maskz_dpbf16_ps (k, src, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 0.0, 0.0]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0xff; + let c: __m128 = _mm_maskz_dpbf16_ps (k, src, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0; + let c: __m128 = _mm_maskz_dpbf16_ps (k, src, a, b); + let result: [f32; 4] = transmute(c.as_f32x4()); + let expected_result: [f32; 4] = [0.0, 0.0, 0.0, 0.0]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_dpbf16_ps() { + let a_array 
= [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m256 = transmute(a_array); + let b1: __m256 = transmute(b_array); + let src: __m256 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); + let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); + let c: __m256 = _mm256_dpbf16_ps (src, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_mask_dpbf16_ps() { + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m256 = transmute(a_array); + let b1: __m256 = transmute(b_array); + let k: __mmask8 = 0x33; + let src: __m256 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); + let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); + let c: __m256 = _mm256_mask_dpbf16_ps (src, k, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0xff; + let c: __m256 = _mm256_mask_dpbf16_ps (src, k, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0; + let c: __m256 = _mm256_mask_dpbf16_ps (src, k, a, 
b); + let result: [f32; 8] = transmute(c.as_f32x8()); + let expected_result: [f32; 8] = [ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_maskz_dpbf16_ps() { + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m256 = transmute(a_array); + let b1: __m256 = transmute(b_array); + let k: __mmask8 = 0x33; + let src: __m256 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); + let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); + let c: __m256 = _mm256_maskz_dpbf16_ps (k, src, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, 0.0, 0.0, + -18.0_f32, -52.0_f32, 0.0, 0.0]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0xff; + let c: __m256 = _mm256_maskz_dpbf16_ps (k, src, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + let expected_result: [f32; 8] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + let k: __mmask8 = 0; + let c: __m256 = _mm256_maskz_dpbf16_ps (k, src, a, b); + let result: [f32; 8] = transmute(c.as_f32x8()); + let expected_result: [f32; 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_dpbf16_ps() { + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + 
-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m512 = transmute(a_array); + let b1: __m512 = transmute(b_array); + let src :__m512 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); + let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); + let c: __m512 = _mm512_dpbf16_ps (src, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_mask_dpbf16_ps() { + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m512 = transmute(a_array); + let b1: __m512 = transmute(b_array); + let k: __mmask16 = 0x3333; + let src: __m512 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); + let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); + let c: __m512 = _mm512_mask_dpbf16_ps (src, k, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32]; + assert_eq!(result, 
expected_result); + let k: __mmask16 = 0xffff; + let c: __m512 = _mm512_mask_dpbf16_ps (src, k, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0; + let c: __m512 = _mm512_mask_dpbf16_ps (src, k, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + let expected_result: [f32; 16] = [ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]; + assert_eq!(result, expected_result); + } + + #[simd_test(enable = "avx512bf16,avx512f")] + unsafe fn test_mm512_maskz_dpbf16_ps() { + let a_array = [ + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + let b_array = [ + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + let a1: __m512 = transmute(a_array); + let b1: __m512 = transmute(b_array); + let k: __mmask16 = 0x3333; + let src: __m512 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); + let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); + let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, 0.0, 0.0, + -18.0_f32, -52.0_f32, 0.0, 0.0, + -18.0_f32, -52.0_f32, 0.0, 0.0, + -18.0_f32, -52.0_f32, 0.0, 0.0]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0xffff; + let c: 
__m512 = _mm512_maskz_dpbf16_ps (k, src, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + let expected_result: [f32; 16] = [ + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + assert_eq!(result, expected_result); + let k: __mmask16 = 0; + let c: __m512 = _mm512_maskz_dpbf16_ps (k, src, a, b); + let result: [f32; 16] = transmute(c.as_f32x16()); + let expected_result: [f32; 16] = [ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]; + assert_eq!(result, expected_result); + } +} \ No newline at end of file diff --git a/library/stdarch/crates/core_arch/src/x86/mod.rs b/library/stdarch/crates/core_arch/src/x86/mod.rs index 4e63ca8845f9..5f3f2e97cfec 100644 --- a/library/stdarch/crates/core_arch/src/x86/mod.rs +++ b/library/stdarch/crates/core_arch/src/x86/mod.rs @@ -296,6 +296,37 @@ types! { /// suffixed with "pd" (or otherwise contain "pd"). Not to be confused with /// "ps" which is used for `__m512`. pub struct __m512d(f64, f64, f64, f64, f64, f64, f64, f64); + + /// 128-bit wide set of eight 'u16' types, x86-specific + /// + /// This type is representing a 128-bit SIMD register which internally is consisted of + /// eight packed `u16` instances. It's purpose is for bf16 related intrinsic + /// implementations. + pub struct __m128bh(u16, u16, u16, u16, u16, u16, u16, u16); + + /// 256-bit wide set of 16 'u16' types, x86-specific + /// + /// This type is the same as the `__m128bh` type defined by Intel, + /// representing a 256-bit SIMD register which internally is consisted of + /// 16 packed `u16` instances. It's purpose is for bf16 related intrinsic + /// implementations. 
+ pub struct __m256bh( + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16 + ); + + /// 512-bit wide set of 32 'u16' types, x86-specific + /// + /// This type is the same as the `__m128bh` type defined by Intel, + /// representing a 512-bit SIMD register which internally is consisted of + /// 32 packed `u16` instances. It's purpose is for bf16 related intrinsic + /// implementations. + pub struct __m512bh( + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16 + ); } /// The `__mmask64` type used in AVX-512 intrinsics, a 64-bit integer @@ -602,6 +633,105 @@ impl m512dExt for __m512d { } } +#[allow(non_camel_case_types)] +#[unstable(feature = "stdsimd_internal", issue = "none")] +pub(crate) trait m128bhExt: Sized { + fn as_m128bh(self) -> __m128bh; + + #[inline] + fn as_u16x8(self) -> crate::core_arch::simd::u16x8 { + unsafe { transmute(self.as_m128bh()) } + } + + #[inline] + fn as_i16x8(self) -> crate::core_arch::simd::i16x8 { + unsafe { transmute(self.as_m128bh()) } + } + + #[inline] + fn as_u32x4(self) -> crate::core_arch::simd::u32x4 { + unsafe { transmute(self.as_m128bh()) } + } + + #[inline] + fn as_i32x4(self) -> crate::core_arch::simd::i32x4 { + unsafe { transmute(self.as_m128bh()) } + } +} + +impl m128bhExt for __m128bh { + #[inline] + fn as_m128bh(self) -> Self { + self + } +} + +#[allow(non_camel_case_types)] +#[unstable(feature = "stdsimd_internal", issue = "none")] +pub(crate) trait m256bhExt: Sized { + fn as_m256bh(self) -> __m256bh; + + #[inline] + fn as_u16x16(self) -> crate::core_arch::simd::u16x16 { + unsafe { transmute(self.as_m256bh()) } + } + + #[inline] + fn as_i16x16(self) -> crate::core_arch::simd::i16x16 { + unsafe { transmute(self.as_m256bh()) } + } + + #[inline] + fn as_u32x8(self) -> crate::core_arch::simd::u32x8 { + unsafe { transmute(self.as_m256bh()) } + } + + #[inline] + fn as_i32x8(self) -> 
crate::core_arch::simd::i32x8 { + unsafe { transmute(self.as_m256bh()) } + } +} + +impl m256bhExt for __m256bh { + #[inline] + fn as_m256bh(self) -> Self { + self + } +} + +#[allow(non_camel_case_types)] +#[unstable(feature = "stdsimd_internal", issue = "none")] +pub(crate) trait m512bhExt: Sized { + fn as_m512bh(self) -> __m512bh; + + #[inline] + fn as_u16x32(self) -> crate::core_arch::simd::u16x32 { + unsafe { transmute(self.as_m512bh()) } + } + + #[inline] + fn as_i16x32(self) -> crate::core_arch::simd::i16x32 { + unsafe { transmute(self.as_m512bh()) } + } + + #[inline] + fn as_u32x16(self) -> crate::core_arch::simd::u32x16 { + unsafe { transmute(self.as_m512bh()) } + } + + #[inline] + fn as_i32x16(self) -> crate::core_arch::simd::i32x16 { + unsafe { transmute(self.as_m512bh()) } + } +} + +impl m512bhExt for __m512bh { + #[inline] + fn as_m512bh(self) -> Self { + self + } +} + mod eflags; pub use self::eflags::*; @@ -725,3 +855,6 @@ pub use self::rtm::*; mod f16c; pub use self::f16c::*; + +mod avx512bf16; +pub use self::avx512bf16::*; diff --git a/library/stdarch/crates/stdarch-verify/src/lib.rs b/library/stdarch/crates/stdarch-verify/src/lib.rs index acf22d19da02..4aa969ba5185 100644 --- a/library/stdarch/crates/stdarch-verify/src/lib.rs +++ b/library/stdarch/crates/stdarch-verify/src/lib.rs @@ -137,12 +137,15 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream { syn::Type::Path(ref p) => match extract_path_ident(&p.path).to_string().as_ref() { // x86 ... "__m128" => quote! { &M128 }, + "__m128bh" => quote! { &M128BH }, "__m128d" => quote! { &M128D }, "__m128i" => quote! { &M128I }, "__m256" => quote! { &M256 }, + "__m256bh" => quote! { &M256BH }, "__m256d" => quote! { &M256D }, "__m256i" => quote! { &M256I }, "__m512" => quote! { &M512 }, + "__m512bh" => quote! { &M512BH }, "__m512d" => quote! { &M512D }, "__m512i" => quote! { &M512I }, "__mmask8" => quote! 
{ &MMASK8 }, diff --git a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs index b867f55db129..6147863cb6fe 100644 --- a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs +++ b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs @@ -45,12 +45,15 @@ static ORDERING: Type = Type::Ordering; static M64: Type = Type::M64; static M128: Type = Type::M128; +static M128BH: Type = Type::M128BH; static M128I: Type = Type::M128I; static M128D: Type = Type::M128D; static M256: Type = Type::M256; +static M256BH: Type = Type::M256BH; static M256I: Type = Type::M256I; static M256D: Type = Type::M256D; static M512: Type = Type::M512; +static M512BH: Type = Type::M512BH; static M512I: Type = Type::M512I; static M512D: Type = Type::M512D; static MMASK8: Type = Type::MMASK8; @@ -75,12 +78,15 @@ enum Type { ConstPtr(&'static Type), M64, M128, + M128BH, M128D, M128I, M256, + M256BH, M256D, M256I, M512, + M512BH, M512D, M512I, MMASK8, @@ -493,6 +499,9 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { // The intrinsics guide calls `f16c` `fp16c` in disagreement with // Intel's architecture manuals. "fp16c" => String::from("f16c"), + "avx512_bf16" => String::from("avx512bf16"), + // The XML file names VNNI as "avx512_bf16", while Rust calls + // it "avx512bf16". 
_ => cpuid, }; let fixed_cpuid = fixup_cpuid(cpuid); @@ -693,12 +702,15 @@ fn equate(t: &Type, intel: &str, intrinsic: &str, is_const: bool) -> Result<(), (&Type::PrimUnsigned(8), "unsigned char") => {} (&Type::M64, "__m64") => {} (&Type::M128, "__m128") => {} + (&Type::M128BH, "__m128bh") => {} (&Type::M128I, "__m128i") => {} (&Type::M128D, "__m128d") => {} (&Type::M256, "__m256") => {} + (&Type::M256BH, "__m256bh") => {} (&Type::M256I, "__m256i") => {} (&Type::M256D, "__m256d") => {} (&Type::M512, "__m512") => {} + (&Type::M512BH, "__m512bh") => {} (&Type::M512I, "__m512i") => {} (&Type::M512D, "__m512d") => {} (&Type::MMASK64, "__mmask64") => {} @@ -726,12 +738,15 @@ fn equate(t: &Type, intel: &str, intrinsic: &str, is_const: bool) -> Result<(), (&Type::MutPtr(&Type::PrimUnsigned(64)), "__mmask64*") => {} (&Type::MutPtr(&Type::M64), "__m64*") => {} (&Type::MutPtr(&Type::M128), "__m128*") => {} + (&Type::MutPtr(&Type::M128BH), "__m128bh*") => {} (&Type::MutPtr(&Type::M128I), "__m128i*") => {} (&Type::MutPtr(&Type::M128D), "__m128d*") => {} (&Type::MutPtr(&Type::M256), "__m256*") => {} + (&Type::MutPtr(&Type::M256BH), "__m256bh*") => {} (&Type::MutPtr(&Type::M256I), "__m256i*") => {} (&Type::MutPtr(&Type::M256D), "__m256d*") => {} (&Type::MutPtr(&Type::M512), "__m512*") => {} + (&Type::MutPtr(&Type::M512BH), "__m512bh*") => {} (&Type::MutPtr(&Type::M512I), "__m512i*") => {} (&Type::MutPtr(&Type::M512D), "__m512d*") => {} @@ -754,12 +769,15 @@ fn equate(t: &Type, intel: &str, intrinsic: &str, is_const: bool) -> Result<(), (&Type::ConstPtr(&Type::PrimUnsigned(32)), "void const*") => {} (&Type::ConstPtr(&Type::M64), "__m64 const*") => {} (&Type::ConstPtr(&Type::M128), "__m128 const*") => {} + (&Type::ConstPtr(&Type::M128BH), "__m128bh const*") => {} (&Type::ConstPtr(&Type::M128I), "__m128i const*") => {} (&Type::ConstPtr(&Type::M128D), "__m128d const*") => {} (&Type::ConstPtr(&Type::M256), "__m256 const*") => {} + (&Type::ConstPtr(&Type::M256BH), "__m256bh const*") 
=> {} (&Type::ConstPtr(&Type::M256I), "__m256i const*") => {} (&Type::ConstPtr(&Type::M256D), "__m256d const*") => {} (&Type::ConstPtr(&Type::M512), "__m512 const*") => {} + (&Type::ConstPtr(&Type::M512BH), "__m512bh const*") => {} (&Type::ConstPtr(&Type::M512I), "__m512i const*") => {} (&Type::ConstPtr(&Type::M512D), "__m512d const*") => {} (&Type::ConstPtr(&Type::PrimUnsigned(32)), "__mmask32*") => {}