mark AVX512 & AVXNECONVERT intrinsics as safe

Mark all AVX512 & AVXNECONVERT SIMD-computing intrinsics as safe, except for those involving memory operations.
2025-01-28 00:00:50 +08:00 · 2025-01-28 00:00:50 +08:00 · f53c07b3ff
commit f53c07b3ff
parent 2348f153ae
16 changed files with 21793 additions and 18447 deletions
--- a/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs
@@ -37,8 +37,8 @@ unsafe extern "C" {
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh {
-    transmute(cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4()))
+pub fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh {
+    unsafe { transmute(cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4())) }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in two vectors
@@ -50,9 +50,11 @@ pub unsafe fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh {
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh {
-    let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
-    transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
+pub fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh {
+    unsafe {
+        let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
+        transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
+    }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in two vectors
@@ -64,9 +66,11 @@ pub unsafe fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh {
-    let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
-    transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
+pub fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh {
+    unsafe {
+        let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
+        transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
+    }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors
@@ -77,8 +81,8 @@ pub unsafe fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m12
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh {
-    transmute(cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8()))
+pub fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh {
+    unsafe { transmute(cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8())) }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
@@ -89,14 +93,11 @@ pub unsafe fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh {
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm256_mask_cvtne2ps_pbh(
-    src: __m256bh,
-    k: __mmask16,
-    a: __m256,
-    b: __m256,
-) -> __m256bh {
-    let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
-    transmute(simd_select_bitmask(k, cvt, src.as_u16x16()))
+pub fn _mm256_mask_cvtne2ps_pbh(src: __m256bh, k: __mmask16, a: __m256, b: __m256) -> __m256bh {
+    unsafe {
+        let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
+        transmute(simd_select_bitmask(k, cvt, src.as_u16x16()))
+    }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
@@ -107,9 +108,11 @@ pub unsafe fn _mm256_mask_cvtne2ps_pbh(
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh {
-    let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
-    transmute(simd_select_bitmask(k, cvt, u16x16::ZERO))
+pub fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh {
+    unsafe {
+        let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
+        transmute(simd_select_bitmask(k, cvt, u16x16::ZERO))
+    }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors
@@ -120,8 +123,8 @@ pub unsafe fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> _
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh {
-    transmute(cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16()))
+pub fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh {
+    unsafe { transmute(cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16())) }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in two vectors
@@ -133,14 +136,11 @@ pub unsafe fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh {
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm512_mask_cvtne2ps_pbh(
-    src: __m512bh,
-    k: __mmask32,
-    a: __m512,
-    b: __m512,
-) -> __m512bh {
-    let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
-    transmute(simd_select_bitmask(k, cvt, src.as_u16x32()))
+pub fn _mm512_mask_cvtne2ps_pbh(src: __m512bh, k: __mmask32, a: __m512, b: __m512) -> __m512bh {
+    unsafe {
+        let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
+        transmute(simd_select_bitmask(k, cvt, src.as_u16x32()))
+    }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in two vectors
@@ -152,9 +152,11 @@ pub unsafe fn _mm512_mask_cvtne2ps_pbh(
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh {
-    let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
-    transmute(simd_select_bitmask(k, cvt, u16x32::ZERO))
+pub fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh {
+    unsafe {
+        let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
+        transmute(simd_select_bitmask(k, cvt, u16x32::ZERO))
+    }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
@@ -164,8 +166,8 @@ pub unsafe fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> _
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
-pub unsafe fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh {
-    transmute(cvtneps2bf16_256(a.as_f32x8()))
+pub fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh {
+    unsafe { transmute(cvtneps2bf16_256(a.as_f32x8())) }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
@@ -176,9 +178,11 @@ pub unsafe fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh {
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
-pub unsafe fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh {
-    let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
-    transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
+pub fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh {
+    unsafe {
+        let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
+        transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
+    }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
@@ -189,9 +193,11 @@ pub unsafe fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) ->
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
-pub unsafe fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh {
-    let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
-    transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
+pub fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh {
+    unsafe {
+        let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
+        transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
+    }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
@@ -201,8 +207,8 @@ pub unsafe fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh {
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
-pub unsafe fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh {
-    transmute(cvtneps2bf16_512(a.as_f32x16()))
+pub fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh {
+    unsafe { transmute(cvtneps2bf16_512(a.as_f32x16())) }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
@@ -213,9 +219,11 @@ pub unsafe fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh {
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
-pub unsafe fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh {
-    let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
-    transmute(simd_select_bitmask(k, cvt, src.as_u16x16()))
+pub fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh {
+    unsafe {
+        let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
+        transmute(simd_select_bitmask(k, cvt, src.as_u16x16()))
+    }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
@@ -226,9 +234,11 @@ pub unsafe fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) ->
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
-pub unsafe fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh {
-    let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
-    transmute(simd_select_bitmask(k, cvt, u16x16::ZERO))
+pub fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh {
+    unsafe {
+        let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
+        transmute(simd_select_bitmask(k, cvt, u16x16::ZERO))
+    }
 }

 /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
@@ -239,8 +249,8 @@ pub unsafe fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh {
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
-    transmute(dpbf16ps(src.as_f32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
+    unsafe { transmute(dpbf16ps(src.as_f32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
@@ -252,9 +262,11 @@ pub unsafe fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 {
-    let rst = _mm_dpbf16_ps(src, a, b).as_f32x4();
-    transmute(simd_select_bitmask(k, rst, src.as_f32x4()))
+pub fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 {
+    unsafe {
+        let rst = _mm_dpbf16_ps(src, a, b).as_f32x4();
+        transmute(simd_select_bitmask(k, rst, src.as_f32x4()))
+    }
 }

 /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
@@ -266,10 +278,12 @@ pub unsafe fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m12
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
-    let rst = _mm_dpbf16_ps(src, a, b).as_f32x4();
-    let zero = _mm_set1_ps(0.0_f32).as_f32x4();
-    transmute(simd_select_bitmask(k, rst, zero))
+pub fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
+    unsafe {
+        let rst = _mm_dpbf16_ps(src, a, b).as_f32x4();
+        let zero = _mm_set1_ps(0.0_f32).as_f32x4();
+        transmute(simd_select_bitmask(k, rst, zero))
+    }
 }

 /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
@@ -280,8 +294,8 @@ pub unsafe fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m1
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
-    transmute(dpbf16ps_256(src.as_f32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
+    unsafe { transmute(dpbf16ps_256(src.as_f32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
@@ -293,9 +307,11 @@ pub unsafe fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 {
-    let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
-    transmute(simd_select_bitmask(k, rst, src.as_f32x8()))
+pub fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 {
+    unsafe {
+        let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
+        transmute(simd_select_bitmask(k, rst, src.as_f32x8()))
+    }
 }

 /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
@@ -307,9 +323,11 @@ pub unsafe fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
-    let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
-    transmute(simd_select_bitmask(k, rst, f32x8::ZERO))
+pub fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
+    unsafe {
+        let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
+        transmute(simd_select_bitmask(k, rst, f32x8::ZERO))
+    }
 }

 /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
@@ -322,8 +340,8 @@ pub unsafe fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: _
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 {
-    transmute(dpbf16ps_512(src.as_f32x16(), a.as_i32x16(), b.as_i32x16()))
+pub fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 {
+    unsafe { transmute(dpbf16ps_512(src.as_f32x16(), a.as_i32x16(), b.as_i32x16())) }
 }

 /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
@@ -335,9 +353,11 @@ pub unsafe fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 {
-    let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
-    transmute(simd_select_bitmask(k, rst, src.as_f32x16()))
+pub fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 {
+    unsafe {
+        let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
+        transmute(simd_select_bitmask(k, rst, src.as_f32x16()))
+    }
 }

 /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
@@ -349,14 +369,11 @@ pub unsafe fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: _
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm512_maskz_dpbf16_ps(
-    k: __mmask16,
-    src: __m512,
-    a: __m512bh,
-    b: __m512bh,
-) -> __m512 {
-    let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
-    transmute(simd_select_bitmask(k, rst, f32x16::ZERO))
+pub fn _mm512_maskz_dpbf16_ps(k: __mmask16, src: __m512, a: __m512bh, b: __m512bh) -> __m512 {
+    unsafe {
+        let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
+        transmute(simd_select_bitmask(k, rst, f32x16::ZERO))
+    }
 }

 /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
@@ -366,8 +383,8 @@ pub unsafe fn _mm512_maskz_dpbf16_ps(
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 {
-    _mm512_castsi512_ps(_mm512_slli_epi32::<16>(_mm512_cvtepi16_epi32(transmute(a))))
+pub fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 {
+    unsafe { _mm512_castsi512_ps(_mm512_slli_epi32::<16>(_mm512_cvtepi16_epi32(transmute(a)))) }
 }

 /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
@@ -378,9 +395,11 @@ pub unsafe fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 {
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> __m512 {
-    let cvt = _mm512_cvtpbh_ps(a);
-    transmute(simd_select_bitmask(k, cvt.as_f32x16(), src.as_f32x16()))
+pub fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> __m512 {
+    unsafe {
+        let cvt = _mm512_cvtpbh_ps(a);
+        transmute(simd_select_bitmask(k, cvt.as_f32x16(), src.as_f32x16()))
+    }
 }

 /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
@@ -391,9 +410,11 @@ pub unsafe fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> _
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 {
-    let cvt = _mm512_cvtpbh_ps(a);
-    transmute(simd_select_bitmask(k, cvt.as_f32x16(), f32x16::ZERO))
+pub fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 {
+    unsafe {
+        let cvt = _mm512_cvtpbh_ps(a);
+        transmute(simd_select_bitmask(k, cvt.as_f32x16(), f32x16::ZERO))
+    }
 }

 /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
@@ -403,8 +424,8 @@ pub unsafe fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 {
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 {
-    _mm256_castsi256_ps(_mm256_slli_epi32::<16>(_mm256_cvtepi16_epi32(transmute(a))))
+pub fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 {
+    unsafe { _mm256_castsi256_ps(_mm256_slli_epi32::<16>(_mm256_cvtepi16_epi32(transmute(a)))) }
 }

 /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
@@ -415,9 +436,11 @@ pub unsafe fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 {
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __m256 {
-    let cvt = _mm256_cvtpbh_ps(a);
-    transmute(simd_select_bitmask(k, cvt.as_f32x8(), src.as_f32x8()))
+pub fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __m256 {
+    unsafe {
+        let cvt = _mm256_cvtpbh_ps(a);
+        transmute(simd_select_bitmask(k, cvt.as_f32x8(), src.as_f32x8()))
+    }
 }

 /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
@@ -428,9 +451,11 @@ pub unsafe fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 {
-    let cvt = _mm256_cvtpbh_ps(a);
-    transmute(simd_select_bitmask(k, cvt.as_f32x8(), f32x8::ZERO))
+pub fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 {
+    unsafe {
+        let cvt = _mm256_cvtpbh_ps(a);
+        transmute(simd_select_bitmask(k, cvt.as_f32x8(), f32x8::ZERO))
+    }
 }

 /// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
@@ -440,8 +465,8 @@ pub unsafe fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 {
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 {
-    _mm_castsi128_ps(_mm_slli_epi32::<16>(_mm_cvtepi16_epi32(transmute(a))))
+pub fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 {
+    unsafe { _mm_castsi128_ps(_mm_slli_epi32::<16>(_mm_cvtepi16_epi32(transmute(a)))) }
 }

 /// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
@@ -452,9 +477,11 @@ pub unsafe fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 {
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m128 {
-    let cvt = _mm_cvtpbh_ps(a);
-    transmute(simd_select_bitmask(k, cvt.as_f32x4(), src.as_f32x4()))
+pub fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m128 {
+    unsafe {
+        let cvt = _mm_cvtpbh_ps(a);
+        transmute(simd_select_bitmask(k, cvt.as_f32x4(), src.as_f32x4()))
+    }
 }

 /// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
@@ -465,9 +492,11 @@ pub unsafe fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m12
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 {
-    let cvt = _mm_cvtpbh_ps(a);
-    transmute(simd_select_bitmask(k, cvt.as_f32x4(), f32x4::ZERO))
+pub fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 {
+    unsafe {
+        let cvt = _mm_cvtpbh_ps(a);
+        transmute(simd_select_bitmask(k, cvt.as_f32x4(), f32x4::ZERO))
+    }
 }

 /// Converts a single BF16 (16-bit) floating-point element in a to a single-precision (32-bit) floating-point
@@ -477,7 +506,7 @@ pub unsafe fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 {
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
-pub unsafe fn _mm_cvtsbh_ss(a: bf16) -> f32 {
+pub fn _mm_cvtsbh_ss(a: bf16) -> f32 {
    f32::from_bits((a.to_bits() as u32) << 16)
 }

@@ -489,15 +518,17 @@ pub unsafe fn _mm_cvtsbh_ss(a: bf16) -> f32 {
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_cvtneps_pbh(a: __m128) -> __m128bh {
-    let mut dst: __m128bh;
-    asm!(
-        "vcvtneps2bf16 {dst}, {src}",
-        dst = lateout(xmm_reg) dst,
-        src = in(xmm_reg) a,
-        options(pure, nomem, nostack, preserves_flags)
-    );
-    dst
+pub fn _mm_cvtneps_pbh(a: __m128) -> __m128bh {
+    unsafe {
+        let mut dst: __m128bh;
+        asm!(
+            "vcvtneps2bf16 {dst}, {src}",
+            dst = lateout(xmm_reg) dst,
+            src = in(xmm_reg) a,
+            options(pure, nomem, nostack, preserves_flags)
+        );
+        dst
+    }
 }

 /// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
@@ -509,16 +540,18 @@ pub unsafe fn _mm_cvtneps_pbh(a: __m128) -> __m128bh {
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh {
-    let mut dst = src;
-    asm!(
-        "vcvtneps2bf16 {dst}{{{k}}},{src}",
-        dst = inlateout(xmm_reg) dst,
-        src = in(xmm_reg) a,
-        k = in(kreg) k,
-        options(pure, nomem, nostack, preserves_flags)
-    );
-    dst
+pub fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh {
+    unsafe {
+        let mut dst = src;
+        asm!(
+            "vcvtneps2bf16 {dst}{{{k}}},{src}",
+            dst = inlateout(xmm_reg) dst,
+            src = in(xmm_reg) a,
+            k = in(kreg) k,
+            options(pure, nomem, nostack, preserves_flags)
+        );
+        dst
+    }
 }

 /// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
@@ -530,16 +563,18 @@ pub unsafe fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh {
-    let mut dst: __m128bh;
-    asm!(
-        "vcvtneps2bf16 {dst}{{{k}}}{{z}},{src}",
-        dst = lateout(xmm_reg) dst,
-        src = in(xmm_reg) a,
-        k = in(kreg) k,
-        options(pure, nomem, nostack, preserves_flags)
-    );
-    dst
+pub fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh {
+    unsafe {
+        let mut dst: __m128bh;
+        asm!(
+            "vcvtneps2bf16 {dst}{{{k}}}{{z}},{src}",
+            dst = lateout(xmm_reg) dst,
+            src = in(xmm_reg) a,
+            k = in(kreg) k,
+            options(pure, nomem, nostack, preserves_flags)
+        );
+        dst
+    }
 }

 /// Converts a single-precision (32-bit) floating-point element in a to a BF16 (16-bit) floating-point
@@ -549,9 +584,11 @@ pub unsafe fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh {
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
-pub unsafe fn _mm_cvtness_sbh(a: f32) -> bf16 {
-    let value: u16 = simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0);
-    bf16::from_bits(value)
+pub fn _mm_cvtness_sbh(a: f32) -> bf16 {
+    unsafe {
+        let value: u16 = simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0);
+        bf16::from_bits(value)
+    }
 }

 #[cfg(test)]
--- a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
@@ -43,8 +43,8 @@ unsafe extern "C" {
 #[target_feature(enable = "avx512bitalg")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntw))]
-pub unsafe fn _mm512_popcnt_epi16(a: __m512i) -> __m512i {
-    transmute(simd_ctpop(a.as_i16x32()))
+pub fn _mm512_popcnt_epi16(a: __m512i) -> __m512i {
+    unsafe { transmute(simd_ctpop(a.as_i16x32())) }
 }

 /// For each packed 16-bit integer maps the value to the number of logical 1 bits.
@@ -57,12 +57,14 @@ pub unsafe fn _mm512_popcnt_epi16(a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512bitalg")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntw))]
-pub unsafe fn _mm512_maskz_popcnt_epi16(k: __mmask32, a: __m512i) -> __m512i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i16x32()),
-        i16x32::ZERO,
-    ))
+pub fn _mm512_maskz_popcnt_epi16(k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i16x32()),
+            i16x32::ZERO,
+        ))
+    }
 }

 /// For each packed 16-bit integer maps the value to the number of logical 1 bits.
@@ -75,12 +77,14 @@ pub unsafe fn _mm512_maskz_popcnt_epi16(k: __mmask32, a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512bitalg")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntw))]
-pub unsafe fn _mm512_mask_popcnt_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i16x32()),
-        src.as_i16x32(),
-    ))
+pub fn _mm512_mask_popcnt_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i16x32()),
+            src.as_i16x32(),
+        ))
+    }
 }

 /// For each packed 16-bit integer maps the value to the number of logical 1 bits.
@@ -90,8 +94,8 @@ pub unsafe fn _mm512_mask_popcnt_epi16(src: __m512i, k: __mmask32, a: __m512i) -
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntw))]
-pub unsafe fn _mm256_popcnt_epi16(a: __m256i) -> __m256i {
-    transmute(simd_ctpop(a.as_i16x16()))
+pub fn _mm256_popcnt_epi16(a: __m256i) -> __m256i {
+    unsafe { transmute(simd_ctpop(a.as_i16x16())) }
 }

 /// For each packed 16-bit integer maps the value to the number of logical 1 bits.
@@ -104,12 +108,14 @@ pub unsafe fn _mm256_popcnt_epi16(a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntw))]
-pub unsafe fn _mm256_maskz_popcnt_epi16(k: __mmask16, a: __m256i) -> __m256i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i16x16()),
-        i16x16::ZERO,
-    ))
+pub fn _mm256_maskz_popcnt_epi16(k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i16x16()),
+            i16x16::ZERO,
+        ))
+    }
 }

 /// For each packed 16-bit integer maps the value to the number of logical 1 bits.
@@ -122,12 +128,14 @@ pub unsafe fn _mm256_maskz_popcnt_epi16(k: __mmask16, a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntw))]
-pub unsafe fn _mm256_mask_popcnt_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i16x16()),
-        src.as_i16x16(),
-    ))
+pub fn _mm256_mask_popcnt_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i16x16()),
+            src.as_i16x16(),
+        ))
+    }
 }

 /// For each packed 16-bit integer maps the value to the number of logical 1 bits.
@@ -137,8 +145,8 @@ pub unsafe fn _mm256_mask_popcnt_epi16(src: __m256i, k: __mmask16, a: __m256i) -
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntw))]
-pub unsafe fn _mm_popcnt_epi16(a: __m128i) -> __m128i {
-    transmute(simd_ctpop(a.as_i16x8()))
+pub fn _mm_popcnt_epi16(a: __m128i) -> __m128i {
+    unsafe { transmute(simd_ctpop(a.as_i16x8())) }
 }

 /// For each packed 16-bit integer maps the value to the number of logical 1 bits.
@@ -151,12 +159,14 @@ pub unsafe fn _mm_popcnt_epi16(a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntw))]
-pub unsafe fn _mm_maskz_popcnt_epi16(k: __mmask8, a: __m128i) -> __m128i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i16x8()),
-        i16x8::ZERO,
-    ))
+pub fn _mm_maskz_popcnt_epi16(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i16x8()),
+            i16x8::ZERO,
+        ))
+    }
 }

 /// For each packed 16-bit integer maps the value to the number of logical 1 bits.
@@ -169,12 +179,14 @@ pub unsafe fn _mm_maskz_popcnt_epi16(k: __mmask8, a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntw))]
-pub unsafe fn _mm_mask_popcnt_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i16x8()),
-        src.as_i16x8(),
-    ))
+pub fn _mm_mask_popcnt_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i16x8()),
+            src.as_i16x8(),
+        ))
+    }
 }

 /// For each packed 8-bit integer maps the value to the number of logical 1 bits.
@@ -184,8 +196,8 @@ pub unsafe fn _mm_mask_popcnt_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __
 #[target_feature(enable = "avx512bitalg")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntb))]
-pub unsafe fn _mm512_popcnt_epi8(a: __m512i) -> __m512i {
-    transmute(simd_ctpop(a.as_i8x64()))
+pub fn _mm512_popcnt_epi8(a: __m512i) -> __m512i {
+    unsafe { transmute(simd_ctpop(a.as_i8x64())) }
 }

 /// For each packed 8-bit integer maps the value to the number of logical 1 bits.
@@ -198,12 +210,14 @@ pub unsafe fn _mm512_popcnt_epi8(a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512bitalg")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntb))]
-pub unsafe fn _mm512_maskz_popcnt_epi8(k: __mmask64, a: __m512i) -> __m512i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i8x64()),
-        i8x64::ZERO,
-    ))
+pub fn _mm512_maskz_popcnt_epi8(k: __mmask64, a: __m512i) -> __m512i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i8x64()),
+            i8x64::ZERO,
+        ))
+    }
 }

 /// For each packed 8-bit integer maps the value to the number of logical 1 bits.
@@ -216,12 +230,14 @@ pub unsafe fn _mm512_maskz_popcnt_epi8(k: __mmask64, a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512bitalg")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntb))]
-pub unsafe fn _mm512_mask_popcnt_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i8x64()),
-        src.as_i8x64(),
-    ))
+pub fn _mm512_mask_popcnt_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i8x64()),
+            src.as_i8x64(),
+        ))
+    }
 }

 /// For each packed 8-bit integer maps the value to the number of logical 1 bits.
@@ -231,8 +247,8 @@ pub unsafe fn _mm512_mask_popcnt_epi8(src: __m512i, k: __mmask64, a: __m512i) ->
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntb))]
-pub unsafe fn _mm256_popcnt_epi8(a: __m256i) -> __m256i {
-    transmute(simd_ctpop(a.as_i8x32()))
+pub fn _mm256_popcnt_epi8(a: __m256i) -> __m256i {
+    unsafe { transmute(simd_ctpop(a.as_i8x32())) }
 }

 /// For each packed 8-bit integer maps the value to the number of logical 1 bits.
@@ -245,12 +261,14 @@ pub unsafe fn _mm256_popcnt_epi8(a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntb))]
-pub unsafe fn _mm256_maskz_popcnt_epi8(k: __mmask32, a: __m256i) -> __m256i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i8x32()),
-        i8x32::ZERO,
-    ))
+pub fn _mm256_maskz_popcnt_epi8(k: __mmask32, a: __m256i) -> __m256i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i8x32()),
+            i8x32::ZERO,
+        ))
+    }
 }

 /// For each packed 8-bit integer maps the value to the number of logical 1 bits.
@@ -263,12 +281,14 @@ pub unsafe fn _mm256_maskz_popcnt_epi8(k: __mmask32, a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntb))]
-pub unsafe fn _mm256_mask_popcnt_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i8x32()),
-        src.as_i8x32(),
-    ))
+pub fn _mm256_mask_popcnt_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i8x32()),
+            src.as_i8x32(),
+        ))
+    }
 }

 /// For each packed 8-bit integer maps the value to the number of logical 1 bits.
@@ -278,8 +298,8 @@ pub unsafe fn _mm256_mask_popcnt_epi8(src: __m256i, k: __mmask32, a: __m256i) ->
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntb))]
-pub unsafe fn _mm_popcnt_epi8(a: __m128i) -> __m128i {
-    transmute(simd_ctpop(a.as_i8x16()))
+pub fn _mm_popcnt_epi8(a: __m128i) -> __m128i {
+    unsafe { transmute(simd_ctpop(a.as_i8x16())) }
 }

 /// For each packed 8-bit integer maps the value to the number of logical 1 bits.
@@ -292,12 +312,14 @@ pub unsafe fn _mm_popcnt_epi8(a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntb))]
-pub unsafe fn _mm_maskz_popcnt_epi8(k: __mmask16, a: __m128i) -> __m128i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i8x16()),
-        i8x16::ZERO,
-    ))
+pub fn _mm_maskz_popcnt_epi8(k: __mmask16, a: __m128i) -> __m128i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i8x16()),
+            i8x16::ZERO,
+        ))
+    }
 }

 /// For each packed 8-bit integer maps the value to the number of logical 1 bits.
@@ -310,12 +332,14 @@ pub unsafe fn _mm_maskz_popcnt_epi8(k: __mmask16, a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntb))]
-pub unsafe fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i8x16()),
-        src.as_i8x16(),
-    ))
+pub fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i8x16()),
+            src.as_i8x16(),
+        ))
+    }
 }

 /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -327,8 +351,8 @@ pub unsafe fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __
 #[target_feature(enable = "avx512bitalg")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpshufbitqmb))]
-pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 {
-    bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0)
+pub fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 {
+    unsafe { bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0) }
 }

 /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -343,8 +367,8 @@ pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64
 #[target_feature(enable = "avx512bitalg")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpshufbitqmb))]
-pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m512i) -> __mmask64 {
-    bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k)
+pub fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m512i) -> __mmask64 {
+    unsafe { bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k) }
 }

 /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -356,8 +380,8 @@ pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpshufbitqmb))]
-pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 {
-    bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0)
+pub fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 {
+    unsafe { bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0) }
 }

 /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -372,8 +396,8 @@ pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpshufbitqmb))]
-pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m256i) -> __mmask32 {
-    bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k)
+pub fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m256i) -> __mmask32 {
+    unsafe { bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k) }
 }

 /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -385,8 +409,8 @@ pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpshufbitqmb))]
-pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 {
-    bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0)
+pub fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 {
+    unsafe { bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0) }
 }

 /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -401,8 +425,8 @@ pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 {
 #[target_feature(enable = "avx512bitalg,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpshufbitqmb))]
-pub unsafe fn _mm_mask_bitshuffle_epi64_mask(k: __mmask16, b: __m128i, c: __m128i) -> __mmask16 {
-    bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k)
+pub fn _mm_mask_bitshuffle_epi64_mask(k: __mmask16, b: __m128i, c: __m128i) -> __mmask16 {
+    unsafe { bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k) }
 }

 #[cfg(test)]
--- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
--- a/library/stdarch/crates/core_arch/src/x86/avx512cd.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512cd.rs
@@ -11,7 +11,7 @@ use stdarch_test::assert_instr;
 #[target_feature(enable = "avx512cd")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d
-pub unsafe fn _mm512_broadcastmw_epi32(k: __mmask16) -> __m512i {
+pub fn _mm512_broadcastmw_epi32(k: __mmask16) -> __m512i {
    _mm512_set1_epi32(k as i32)
 }

@@ -22,7 +22,7 @@ pub unsafe fn _mm512_broadcastmw_epi32(k: __mmask16) -> __m512i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d
-pub unsafe fn _mm256_broadcastmw_epi32(k: __mmask16) -> __m256i {
+pub fn _mm256_broadcastmw_epi32(k: __mmask16) -> __m256i {
    _mm256_set1_epi32(k as i32)
 }

@@ -33,7 +33,7 @@ pub unsafe fn _mm256_broadcastmw_epi32(k: __mmask16) -> __m256i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d
-pub unsafe fn _mm_broadcastmw_epi32(k: __mmask16) -> __m128i {
+pub fn _mm_broadcastmw_epi32(k: __mmask16) -> __m128i {
    _mm_set1_epi32(k as i32)
 }

@@ -44,7 +44,7 @@ pub unsafe fn _mm_broadcastmw_epi32(k: __mmask16) -> __m128i {
 #[target_feature(enable = "avx512cd")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q
-pub unsafe fn _mm512_broadcastmb_epi64(k: __mmask8) -> __m512i {
+pub fn _mm512_broadcastmb_epi64(k: __mmask8) -> __m512i {
    _mm512_set1_epi64(k as i64)
 }

@@ -55,7 +55,7 @@ pub unsafe fn _mm512_broadcastmb_epi64(k: __mmask8) -> __m512i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q
-pub unsafe fn _mm256_broadcastmb_epi64(k: __mmask8) -> __m256i {
+pub fn _mm256_broadcastmb_epi64(k: __mmask8) -> __m256i {
    _mm256_set1_epi64x(k as i64)
 }

@@ -66,7 +66,7 @@ pub unsafe fn _mm256_broadcastmb_epi64(k: __mmask8) -> __m256i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q
-pub unsafe fn _mm_broadcastmb_epi64(k: __mmask8) -> __m128i {
+pub fn _mm_broadcastmb_epi64(k: __mmask8) -> __m128i {
    _mm_set1_epi64x(k as i64)
 }

@@ -77,8 +77,8 @@ pub unsafe fn _mm_broadcastmb_epi64(k: __mmask8) -> __m128i {
 #[target_feature(enable = "avx512cd")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictd))]
-pub unsafe fn _mm512_conflict_epi32(a: __m512i) -> __m512i {
-    transmute(vpconflictd(a.as_i32x16()))
+pub fn _mm512_conflict_epi32(a: __m512i) -> __m512i {
+    unsafe { transmute(vpconflictd(a.as_i32x16())) }
 }

 /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
@@ -88,9 +88,11 @@ pub unsafe fn _mm512_conflict_epi32(a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512cd")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictd))]
-pub unsafe fn _mm512_mask_conflict_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
-    let conflict = _mm512_conflict_epi32(a).as_i32x16();
-    transmute(simd_select_bitmask(k, conflict, src.as_i32x16()))
+pub fn _mm512_mask_conflict_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        let conflict = _mm512_conflict_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, conflict, src.as_i32x16()))
+    }
 }

 /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
@@ -100,9 +102,11 @@ pub unsafe fn _mm512_mask_conflict_epi32(src: __m512i, k: __mmask16, a: __m512i)
 #[target_feature(enable = "avx512cd")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictd))]
-pub unsafe fn _mm512_maskz_conflict_epi32(k: __mmask16, a: __m512i) -> __m512i {
-    let conflict = _mm512_conflict_epi32(a).as_i32x16();
-    transmute(simd_select_bitmask(k, conflict, i32x16::ZERO))
+pub fn _mm512_maskz_conflict_epi32(k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        let conflict = _mm512_conflict_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, conflict, i32x16::ZERO))
+    }
 }

 /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
@@ -112,8 +116,8 @@ pub unsafe fn _mm512_maskz_conflict_epi32(k: __mmask16, a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictd))]
-pub unsafe fn _mm256_conflict_epi32(a: __m256i) -> __m256i {
-    transmute(vpconflictd256(a.as_i32x8()))
+pub fn _mm256_conflict_epi32(a: __m256i) -> __m256i {
+    unsafe { transmute(vpconflictd256(a.as_i32x8())) }
 }

 /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
@@ -123,9 +127,11 @@ pub unsafe fn _mm256_conflict_epi32(a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictd))]
-pub unsafe fn _mm256_mask_conflict_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
-    let conflict = _mm256_conflict_epi32(a).as_i32x8();
-    transmute(simd_select_bitmask(k, conflict, src.as_i32x8()))
+pub fn _mm256_mask_conflict_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        let conflict = _mm256_conflict_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, conflict, src.as_i32x8()))
+    }
 }

 /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
@@ -135,9 +141,11 @@ pub unsafe fn _mm256_mask_conflict_epi32(src: __m256i, k: __mmask8, a: __m256i)
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictd))]
-pub unsafe fn _mm256_maskz_conflict_epi32(k: __mmask8, a: __m256i) -> __m256i {
-    let conflict = _mm256_conflict_epi32(a).as_i32x8();
-    transmute(simd_select_bitmask(k, conflict, i32x8::ZERO))
+pub fn _mm256_maskz_conflict_epi32(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        let conflict = _mm256_conflict_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, conflict, i32x8::ZERO))
+    }
 }

 /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
@@ -147,8 +155,8 @@ pub unsafe fn _mm256_maskz_conflict_epi32(k: __mmask8, a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictd))]
-pub unsafe fn _mm_conflict_epi32(a: __m128i) -> __m128i {
-    transmute(vpconflictd128(a.as_i32x4()))
+pub fn _mm_conflict_epi32(a: __m128i) -> __m128i {
+    unsafe { transmute(vpconflictd128(a.as_i32x4())) }
 }

 /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
@@ -158,9 +166,11 @@ pub unsafe fn _mm_conflict_epi32(a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictd))]
-pub unsafe fn _mm_mask_conflict_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    let conflict = _mm_conflict_epi32(a).as_i32x4();
-    transmute(simd_select_bitmask(k, conflict, src.as_i32x4()))
+pub fn _mm_mask_conflict_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let conflict = _mm_conflict_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, conflict, src.as_i32x4()))
+    }
 }

 /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
@@ -170,9 +180,11 @@ pub unsafe fn _mm_mask_conflict_epi32(src: __m128i, k: __mmask8, a: __m128i) ->
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictd))]
-pub unsafe fn _mm_maskz_conflict_epi32(k: __mmask8, a: __m128i) -> __m128i {
-    let conflict = _mm_conflict_epi32(a).as_i32x4();
-    transmute(simd_select_bitmask(k, conflict, i32x4::ZERO))
+pub fn _mm_maskz_conflict_epi32(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let conflict = _mm_conflict_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, conflict, i32x4::ZERO))
+    }
 }

 /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
@@ -182,8 +194,8 @@ pub unsafe fn _mm_maskz_conflict_epi32(k: __mmask8, a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512cd")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictq))]
-pub unsafe fn _mm512_conflict_epi64(a: __m512i) -> __m512i {
-    transmute(vpconflictq(a.as_i64x8()))
+pub fn _mm512_conflict_epi64(a: __m512i) -> __m512i {
+    unsafe { transmute(vpconflictq(a.as_i64x8())) }
 }

 /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
@@ -193,9 +205,11 @@ pub unsafe fn _mm512_conflict_epi64(a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512cd")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictq))]
-pub unsafe fn _mm512_mask_conflict_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
-    let conflict = _mm512_conflict_epi64(a).as_i64x8();
-    transmute(simd_select_bitmask(k, conflict, src.as_i64x8()))
+pub fn _mm512_mask_conflict_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        let conflict = _mm512_conflict_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, conflict, src.as_i64x8()))
+    }
 }

 /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
@@ -205,9 +219,11 @@ pub unsafe fn _mm512_mask_conflict_epi64(src: __m512i, k: __mmask8, a: __m512i)
 #[target_feature(enable = "avx512cd")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictq))]
-pub unsafe fn _mm512_maskz_conflict_epi64(k: __mmask8, a: __m512i) -> __m512i {
-    let conflict = _mm512_conflict_epi64(a).as_i64x8();
-    transmute(simd_select_bitmask(k, conflict, i64x8::ZERO))
+pub fn _mm512_maskz_conflict_epi64(k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        let conflict = _mm512_conflict_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, conflict, i64x8::ZERO))
+    }
 }

 /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
@@ -217,8 +233,8 @@ pub unsafe fn _mm512_maskz_conflict_epi64(k: __mmask8, a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictq))]
-pub unsafe fn _mm256_conflict_epi64(a: __m256i) -> __m256i {
-    transmute(vpconflictq256(a.as_i64x4()))
+pub fn _mm256_conflict_epi64(a: __m256i) -> __m256i {
+    unsafe { transmute(vpconflictq256(a.as_i64x4())) }
 }

 /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
@@ -228,9 +244,11 @@ pub unsafe fn _mm256_conflict_epi64(a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictq))]
-pub unsafe fn _mm256_mask_conflict_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
-    let conflict = _mm256_conflict_epi64(a).as_i64x4();
-    transmute(simd_select_bitmask(k, conflict, src.as_i64x4()))
+pub fn _mm256_mask_conflict_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        let conflict = _mm256_conflict_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, conflict, src.as_i64x4()))
+    }
 }

 /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
@@ -240,9 +258,11 @@ pub unsafe fn _mm256_mask_conflict_epi64(src: __m256i, k: __mmask8, a: __m256i)
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictq))]
-pub unsafe fn _mm256_maskz_conflict_epi64(k: __mmask8, a: __m256i) -> __m256i {
-    let conflict = _mm256_conflict_epi64(a).as_i64x4();
-    transmute(simd_select_bitmask(k, conflict, i64x4::ZERO))
+pub fn _mm256_maskz_conflict_epi64(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        let conflict = _mm256_conflict_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, conflict, i64x4::ZERO))
+    }
 }

 /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
@@ -252,8 +272,8 @@ pub unsafe fn _mm256_maskz_conflict_epi64(k: __mmask8, a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictq))]
-pub unsafe fn _mm_conflict_epi64(a: __m128i) -> __m128i {
-    transmute(vpconflictq128(a.as_i64x2()))
+pub fn _mm_conflict_epi64(a: __m128i) -> __m128i {
+    unsafe { transmute(vpconflictq128(a.as_i64x2())) }
 }

 /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
@@ -263,9 +283,11 @@ pub unsafe fn _mm_conflict_epi64(a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictq))]
-pub unsafe fn _mm_mask_conflict_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    let conflict = _mm_conflict_epi64(a).as_i64x2();
-    transmute(simd_select_bitmask(k, conflict, src.as_i64x2()))
+pub fn _mm_mask_conflict_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let conflict = _mm_conflict_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, conflict, src.as_i64x2()))
+    }
 }

 /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
@@ -275,9 +297,11 @@ pub unsafe fn _mm_mask_conflict_epi64(src: __m128i, k: __mmask8, a: __m128i) ->
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpconflictq))]
-pub unsafe fn _mm_maskz_conflict_epi64(k: __mmask8, a: __m128i) -> __m128i {
-    let conflict = _mm_conflict_epi64(a).as_i64x2();
-    transmute(simd_select_bitmask(k, conflict, i64x2::ZERO))
+pub fn _mm_maskz_conflict_epi64(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let conflict = _mm_conflict_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, conflict, i64x2::ZERO))
+    }
 }

 /// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst.
@@ -287,8 +311,8 @@ pub unsafe fn _mm_maskz_conflict_epi64(k: __mmask8, a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512cd")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntd))]
-pub unsafe fn _mm512_lzcnt_epi32(a: __m512i) -> __m512i {
-    transmute(simd_ctlz(a.as_i32x16()))
+pub fn _mm512_lzcnt_epi32(a: __m512i) -> __m512i {
+    unsafe { transmute(simd_ctlz(a.as_i32x16())) }
 }

 /// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -298,9 +322,11 @@ pub unsafe fn _mm512_lzcnt_epi32(a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512cd")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntd))]
-pub unsafe fn _mm512_mask_lzcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
-    let zerocount = _mm512_lzcnt_epi32(a).as_i32x16();
-    transmute(simd_select_bitmask(k, zerocount, src.as_i32x16()))
+pub fn _mm512_mask_lzcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        let zerocount = _mm512_lzcnt_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, zerocount, src.as_i32x16()))
+    }
 }

 /// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -310,9 +336,11 @@ pub unsafe fn _mm512_mask_lzcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) ->
 #[target_feature(enable = "avx512cd")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntd))]
-pub unsafe fn _mm512_maskz_lzcnt_epi32(k: __mmask16, a: __m512i) -> __m512i {
-    let zerocount = _mm512_lzcnt_epi32(a).as_i32x16();
-    transmute(simd_select_bitmask(k, zerocount, i32x16::ZERO))
+pub fn _mm512_maskz_lzcnt_epi32(k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        let zerocount = _mm512_lzcnt_epi32(a).as_i32x16();
+        transmute(simd_select_bitmask(k, zerocount, i32x16::ZERO))
+    }
 }

 /// Counts the number of leading zero bits in each packed 32-bit integer in a, and stores the results in dst.
@ -322,8 +350,8 @@ pub unsafe fn _mm512_maskz_lzcnt_epi32(k: __mmask16, a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntd))]
-pub unsafe fn _mm256_lzcnt_epi32(a: __m256i) -> __m256i {
-    transmute(simd_ctlz(a.as_i32x8()))
+pub fn _mm256_lzcnt_epi32(a: __m256i) -> __m256i {
+    unsafe { transmute(simd_ctlz(a.as_i32x8())) }
 }

 /// Counts the number of leading zero bits in each packed 32-bit integer in a, and stores the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -333,9 +361,11 @@ pub unsafe fn _mm256_lzcnt_epi32(a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntd))]
-pub unsafe fn _mm256_mask_lzcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
-    let zerocount = _mm256_lzcnt_epi32(a).as_i32x8();
-    transmute(simd_select_bitmask(k, zerocount, src.as_i32x8()))
+pub fn _mm256_mask_lzcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        let zerocount = _mm256_lzcnt_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, zerocount, src.as_i32x8()))
+    }
 }

 /// Counts the number of leading zero bits in each packed 32-bit integer in a, and stores the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -345,9 +375,11 @@ pub unsafe fn _mm256_mask_lzcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) ->
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntd))]
-pub unsafe fn _mm256_maskz_lzcnt_epi32(k: __mmask8, a: __m256i) -> __m256i {
-    let zerocount = _mm256_lzcnt_epi32(a).as_i32x8();
-    transmute(simd_select_bitmask(k, zerocount, i32x8::ZERO))
+pub fn _mm256_maskz_lzcnt_epi32(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        let zerocount = _mm256_lzcnt_epi32(a).as_i32x8();
+        transmute(simd_select_bitmask(k, zerocount, i32x8::ZERO))
+    }
 }

 /// Counts the number of leading zero bits in each packed 32-bit integer in a, and stores the results in dst.
@ -357,8 +389,8 @@ pub unsafe fn _mm256_maskz_lzcnt_epi32(k: __mmask8, a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntd))]
-pub unsafe fn _mm_lzcnt_epi32(a: __m128i) -> __m128i {
-    transmute(simd_ctlz(a.as_i32x4()))
+pub fn _mm_lzcnt_epi32(a: __m128i) -> __m128i {
+    unsafe { transmute(simd_ctlz(a.as_i32x4())) }
 }

 /// Counts the number of leading zero bits in each packed 32-bit integer in a, and stores the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -368,9 +400,11 @@ pub unsafe fn _mm_lzcnt_epi32(a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntd))]
-pub unsafe fn _mm_mask_lzcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    let zerocount = _mm_lzcnt_epi32(a).as_i32x4();
-    transmute(simd_select_bitmask(k, zerocount, src.as_i32x4()))
+pub fn _mm_mask_lzcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let zerocount = _mm_lzcnt_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, zerocount, src.as_i32x4()))
+    }
 }

 /// Counts the number of leading zero bits in each packed 32-bit integer in a, and stores the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -380,9 +414,11 @@ pub unsafe fn _mm_mask_lzcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntd))]
-pub unsafe fn _mm_maskz_lzcnt_epi32(k: __mmask8, a: __m128i) -> __m128i {
-    let zerocount = _mm_lzcnt_epi32(a).as_i32x4();
-    transmute(simd_select_bitmask(k, zerocount, i32x4::ZERO))
+pub fn _mm_maskz_lzcnt_epi32(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let zerocount = _mm_lzcnt_epi32(a).as_i32x4();
+        transmute(simd_select_bitmask(k, zerocount, i32x4::ZERO))
+    }
 }

 /// Counts the number of leading zero bits in each packed 64-bit integer in a, and stores the results in dst.
@ -392,8 +428,8 @@ pub unsafe fn _mm_maskz_lzcnt_epi32(k: __mmask8, a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512cd")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntq))]
-pub unsafe fn _mm512_lzcnt_epi64(a: __m512i) -> __m512i {
-    transmute(simd_ctlz(a.as_i64x8()))
+pub fn _mm512_lzcnt_epi64(a: __m512i) -> __m512i {
+    unsafe { transmute(simd_ctlz(a.as_i64x8())) }
 }

 /// Counts the number of leading zero bits in each packed 64-bit integer in a, and stores the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -403,9 +439,11 @@ pub unsafe fn _mm512_lzcnt_epi64(a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512cd")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntq))]
-pub unsafe fn _mm512_mask_lzcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
-    let zerocount = _mm512_lzcnt_epi64(a).as_i64x8();
-    transmute(simd_select_bitmask(k, zerocount, src.as_i64x8()))
+pub fn _mm512_mask_lzcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        let zerocount = _mm512_lzcnt_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, zerocount, src.as_i64x8()))
+    }
 }

 /// Counts the number of leading zero bits in each packed 64-bit integer in a, and stores the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -415,9 +453,11 @@ pub unsafe fn _mm512_mask_lzcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) ->
 #[target_feature(enable = "avx512cd")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntq))]
-pub unsafe fn _mm512_maskz_lzcnt_epi64(k: __mmask8, a: __m512i) -> __m512i {
-    let zerocount = _mm512_lzcnt_epi64(a).as_i64x8();
-    transmute(simd_select_bitmask(k, zerocount, i64x8::ZERO))
+pub fn _mm512_maskz_lzcnt_epi64(k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        let zerocount = _mm512_lzcnt_epi64(a).as_i64x8();
+        transmute(simd_select_bitmask(k, zerocount, i64x8::ZERO))
+    }
 }

 /// Counts the number of leading zero bits in each packed 64-bit integer in a, and stores the results in dst.
@ -427,8 +467,8 @@ pub unsafe fn _mm512_maskz_lzcnt_epi64(k: __mmask8, a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntq))]
-pub unsafe fn _mm256_lzcnt_epi64(a: __m256i) -> __m256i {
-    transmute(simd_ctlz(a.as_i64x4()))
+pub fn _mm256_lzcnt_epi64(a: __m256i) -> __m256i {
+    unsafe { transmute(simd_ctlz(a.as_i64x4())) }
 }

 /// Counts the number of leading zero bits in each packed 64-bit integer in a, and stores the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -438,9 +478,11 @@ pub unsafe fn _mm256_lzcnt_epi64(a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntq))]
-pub unsafe fn _mm256_mask_lzcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
-    let zerocount = _mm256_lzcnt_epi64(a).as_i64x4();
-    transmute(simd_select_bitmask(k, zerocount, src.as_i64x4()))
+pub fn _mm256_mask_lzcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        let zerocount = _mm256_lzcnt_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, zerocount, src.as_i64x4()))
+    }
 }

 /// Counts the number of leading zero bits in each packed 64-bit integer in a, and stores the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -450,9 +492,11 @@ pub unsafe fn _mm256_mask_lzcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) ->
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntq))]
-pub unsafe fn _mm256_maskz_lzcnt_epi64(k: __mmask8, a: __m256i) -> __m256i {
-    let zerocount = _mm256_lzcnt_epi64(a).as_i64x4();
-    transmute(simd_select_bitmask(k, zerocount, i64x4::ZERO))
+pub fn _mm256_maskz_lzcnt_epi64(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        let zerocount = _mm256_lzcnt_epi64(a).as_i64x4();
+        transmute(simd_select_bitmask(k, zerocount, i64x4::ZERO))
+    }
 }

 /// Counts the number of leading zero bits in each packed 64-bit integer in a, and stores the results in dst.
@ -462,8 +506,8 @@ pub unsafe fn _mm256_maskz_lzcnt_epi64(k: __mmask8, a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntq))]
-pub unsafe fn _mm_lzcnt_epi64(a: __m128i) -> __m128i {
-    transmute(simd_ctlz(a.as_i64x2()))
+pub fn _mm_lzcnt_epi64(a: __m128i) -> __m128i {
+    unsafe { transmute(simd_ctlz(a.as_i64x2())) }
 }

 /// Counts the number of leading zero bits in each packed 64-bit integer in a, and stores the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -473,9 +517,11 @@ pub unsafe fn _mm_lzcnt_epi64(a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntq))]
-pub unsafe fn _mm_mask_lzcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    let zerocount = _mm_lzcnt_epi64(a).as_i64x2();
-    transmute(simd_select_bitmask(k, zerocount, src.as_i64x2()))
+pub fn _mm_mask_lzcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let zerocount = _mm_lzcnt_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, zerocount, src.as_i64x2()))
+    }
 }

 /// Counts the number of leading zero bits in each packed 64-bit integer in a, and stores the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -485,9 +531,11 @@ pub unsafe fn _mm_mask_lzcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m
 #[target_feature(enable = "avx512cd,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vplzcntq))]
-pub unsafe fn _mm_maskz_lzcnt_epi64(k: __mmask8, a: __m128i) -> __m128i {
-    let zerocount = _mm_lzcnt_epi64(a).as_i64x2();
-    transmute(simd_select_bitmask(k, zerocount, i64x2::ZERO))
+pub fn _mm_maskz_lzcnt_epi64(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        let zerocount = _mm_lzcnt_epi64(a).as_i64x2();
+        transmute(simd_select_bitmask(k, zerocount, i64x2::ZERO))
+    }
 }

 #[allow(improper_ctypes)]
--- a/library/stdarch/crates/core_arch/src/x86/avx512dq.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512dq.rs
--- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
--- a/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs
--- a/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs
@ -15,8 +15,8 @@ use stdarch_test::assert_instr;
 #[target_feature(enable = "avx512ifma")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52huq))]
-pub unsafe fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
-    vpmadd52huq_512(a, b, c)
+pub fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    unsafe { vpmadd52huq_512(a, b, c) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -31,13 +31,8 @@ pub unsafe fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m51
 #[target_feature(enable = "avx512ifma")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52huq))]
-pub unsafe fn _mm512_mask_madd52hi_epu64(
-    a: __m512i,
-    k: __mmask8,
-    b: __m512i,
-    c: __m512i,
-) -> __m512i {
-    simd_select_bitmask(k, vpmadd52huq_512(a, b, c), a)
+pub fn _mm512_mask_madd52hi_epu64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i {
+    unsafe { simd_select_bitmask(k, vpmadd52huq_512(a, b, c), a) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -52,13 +47,8 @@ pub unsafe fn _mm512_mask_madd52hi_epu64(
 #[target_feature(enable = "avx512ifma")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52huq))]
-pub unsafe fn _mm512_maskz_madd52hi_epu64(
-    k: __mmask8,
-    a: __m512i,
-    b: __m512i,
-    c: __m512i,
-) -> __m512i {
-    simd_select_bitmask(k, vpmadd52huq_512(a, b, c), _mm512_setzero_si512())
+pub fn _mm512_maskz_madd52hi_epu64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    unsafe { simd_select_bitmask(k, vpmadd52huq_512(a, b, c), _mm512_setzero_si512()) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -72,8 +62,8 @@ pub unsafe fn _mm512_maskz_madd52hi_epu64(
 #[target_feature(enable = "avx512ifma")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52luq))]
-pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
-    vpmadd52luq_512(a, b, c)
+pub fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    unsafe { vpmadd52luq_512(a, b, c) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -88,13 +78,8 @@ pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m51
 #[target_feature(enable = "avx512ifma")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52luq))]
-pub unsafe fn _mm512_mask_madd52lo_epu64(
-    a: __m512i,
-    k: __mmask8,
-    b: __m512i,
-    c: __m512i,
-) -> __m512i {
-    simd_select_bitmask(k, vpmadd52luq_512(a, b, c), a)
+pub fn _mm512_mask_madd52lo_epu64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i {
+    unsafe { simd_select_bitmask(k, vpmadd52luq_512(a, b, c), a) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -109,13 +94,8 @@ pub unsafe fn _mm512_mask_madd52lo_epu64(
 #[target_feature(enable = "avx512ifma")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52luq))]
-pub unsafe fn _mm512_maskz_madd52lo_epu64(
-    k: __mmask8,
-    a: __m512i,
-    b: __m512i,
-    c: __m512i,
-) -> __m512i {
-    simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512())
+pub fn _mm512_maskz_madd52lo_epu64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    unsafe { simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512()) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -132,8 +112,8 @@ pub unsafe fn _mm512_maskz_madd52lo_epu64(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpmadd52huq)
 )]
-pub unsafe fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
-    vpmadd52huq_256(a, b, c)
+pub fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    unsafe { vpmadd52huq_256(a, b, c) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -147,8 +127,8 @@ pub unsafe fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> _
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52huq))]
-pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
-    vpmadd52huq_256(a, b, c)
+pub fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    unsafe { vpmadd52huq_256(a, b, c) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -163,13 +143,8 @@ pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m25
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52huq))]
-pub unsafe fn _mm256_mask_madd52hi_epu64(
-    a: __m256i,
-    k: __mmask8,
-    b: __m256i,
-    c: __m256i,
-) -> __m256i {
-    simd_select_bitmask(k, vpmadd52huq_256(a, b, c), a)
+pub fn _mm256_mask_madd52hi_epu64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
+    unsafe { simd_select_bitmask(k, vpmadd52huq_256(a, b, c), a) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -184,13 +159,8 @@ pub unsafe fn _mm256_mask_madd52hi_epu64(
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52huq))]
-pub unsafe fn _mm256_maskz_madd52hi_epu64(
-    k: __mmask8,
-    a: __m256i,
-    b: __m256i,
-    c: __m256i,
-) -> __m256i {
-    simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256())
+pub fn _mm256_maskz_madd52hi_epu64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    unsafe { simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256()) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -207,8 +177,8 @@ pub unsafe fn _mm256_maskz_madd52hi_epu64(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpmadd52luq)
 )]
-pub unsafe fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
-    vpmadd52luq_256(a, b, c)
+pub fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    unsafe { vpmadd52luq_256(a, b, c) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -222,8 +192,8 @@ pub unsafe fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> _
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52luq))]
-pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
-    vpmadd52luq_256(a, b, c)
+pub fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    unsafe { vpmadd52luq_256(a, b, c) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -238,13 +208,8 @@ pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m25
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52luq))]
-pub unsafe fn _mm256_mask_madd52lo_epu64(
-    a: __m256i,
-    k: __mmask8,
-    b: __m256i,
-    c: __m256i,
-) -> __m256i {
-    simd_select_bitmask(k, vpmadd52luq_256(a, b, c), a)
+pub fn _mm256_mask_madd52lo_epu64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
+    unsafe { simd_select_bitmask(k, vpmadd52luq_256(a, b, c), a) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -259,13 +224,8 @@ pub unsafe fn _mm256_mask_madd52lo_epu64(
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52luq))]
-pub unsafe fn _mm256_maskz_madd52lo_epu64(
-    k: __mmask8,
-    a: __m256i,
-    b: __m256i,
-    c: __m256i,
-) -> __m256i {
-    simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256())
+pub fn _mm256_maskz_madd52lo_epu64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    unsafe { simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256()) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -282,8 +242,8 @@ pub unsafe fn _mm256_maskz_madd52lo_epu64(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpmadd52huq)
 )]
-pub unsafe fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
-    vpmadd52huq_128(a, b, c)
+pub fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { vpmadd52huq_128(a, b, c) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -297,8 +257,8 @@ pub unsafe fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m1
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52huq))]
-pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
-    vpmadd52huq_128(a, b, c)
+pub fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { vpmadd52huq_128(a, b, c) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -313,8 +273,8 @@ pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52huq))]
-pub unsafe fn _mm_mask_madd52hi_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
-    simd_select_bitmask(k, vpmadd52huq_128(a, b, c), a)
+pub fn _mm_mask_madd52hi_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { simd_select_bitmask(k, vpmadd52huq_128(a, b, c), a) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -329,8 +289,8 @@ pub unsafe fn _mm_mask_madd52hi_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52huq))]
-pub unsafe fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
-    simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128())
+pub fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128()) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -347,8 +307,8 @@ pub unsafe fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: _
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpmadd52luq)
 )]
-pub unsafe fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
-    vpmadd52luq_128(a, b, c)
+pub fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { vpmadd52luq_128(a, b, c) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -362,8 +322,8 @@ pub unsafe fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m1
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52luq))]
-pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
-    vpmadd52luq_128(a, b, c)
+pub fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { vpmadd52luq_128(a, b, c) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -378,8 +338,8 @@ pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52luq))]
-pub unsafe fn _mm_mask_madd52lo_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
-    simd_select_bitmask(k, vpmadd52luq_128(a, b, c), a)
+pub fn _mm_mask_madd52lo_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { simd_select_bitmask(k, vpmadd52luq_128(a, b, c), a) }
 }

 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
@ -394,8 +354,8 @@ pub unsafe fn _mm_mask_madd52lo_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __
 #[target_feature(enable = "avx512ifma,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmadd52luq))]
-pub unsafe fn _mm_maskz_madd52lo_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
-    simd_select_bitmask(k, vpmadd52luq_128(a, b, c), _mm_setzero_si128())
+pub fn _mm_maskz_madd52lo_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { simd_select_bitmask(k, vpmadd52luq_128(a, b, c), _mm_setzero_si128()) }
 }

 #[allow(improper_ctypes)]
--- a/library/stdarch/crates/core_arch/src/x86/avx512vbmi.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vbmi.rs
@ -11,8 +11,8 @@ use stdarch_test::assert_instr;
 #[target_feature(enable = "avx512vbmi")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b
-pub unsafe fn _mm512_permutex2var_epi8(a: __m512i, idx: __m512i, b: __m512i) -> __m512i {
-    transmute(vpermi2b(a.as_i8x64(), idx.as_i8x64(), b.as_i8x64()))
+pub fn _mm512_permutex2var_epi8(a: __m512i, idx: __m512i, b: __m512i) -> __m512i {
+    unsafe { transmute(vpermi2b(a.as_i8x64(), idx.as_i8x64(), b.as_i8x64())) }
 }

 /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -22,14 +22,16 @@ pub unsafe fn _mm512_permutex2var_epi8(a: __m512i, idx: __m512i, b: __m512i) ->
 #[target_feature(enable = "avx512vbmi")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermt2b))]
-pub unsafe fn _mm512_mask_permutex2var_epi8(
+pub fn _mm512_mask_permutex2var_epi8(
    a: __m512i,
    k: __mmask64,
    idx: __m512i,
    b: __m512i,
 ) -> __m512i {
-    let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64();
-    transmute(simd_select_bitmask(k, permute, a.as_i8x64()))
+    unsafe {
+        let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64();
+        transmute(simd_select_bitmask(k, permute, a.as_i8x64()))
+    }
 }

 /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -39,14 +41,16 @@ pub unsafe fn _mm512_mask_permutex2var_epi8(
 #[target_feature(enable = "avx512vbmi")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b
-pub unsafe fn _mm512_maskz_permutex2var_epi8(
+pub fn _mm512_maskz_permutex2var_epi8(
    k: __mmask64,
    a: __m512i,
    idx: __m512i,
    b: __m512i,
 ) -> __m512i {
-    let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64();
-    transmute(simd_select_bitmask(k, permute, i8x64::ZERO))
+    unsafe {
+        let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64();
+        transmute(simd_select_bitmask(k, permute, i8x64::ZERO))
+    }
 }

 /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
@ -56,14 +60,16 @@ pub unsafe fn _mm512_maskz_permutex2var_epi8(
 #[target_feature(enable = "avx512vbmi")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermi2b))]
-pub unsafe fn _mm512_mask2_permutex2var_epi8(
+pub fn _mm512_mask2_permutex2var_epi8(
    a: __m512i,
    idx: __m512i,
    k: __mmask64,
    b: __m512i,
 ) -> __m512i {
-    let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64();
-    transmute(simd_select_bitmask(k, permute, idx.as_i8x64()))
+    unsafe {
+        let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64();
+        transmute(simd_select_bitmask(k, permute, idx.as_i8x64()))
+    }
 }

 /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
@ -73,8 +79,8 @@ pub unsafe fn _mm512_mask2_permutex2var_epi8(
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b
-pub unsafe fn _mm256_permutex2var_epi8(a: __m256i, idx: __m256i, b: __m256i) -> __m256i {
-    transmute(vpermi2b256(a.as_i8x32(), idx.as_i8x32(), b.as_i8x32()))
+pub fn _mm256_permutex2var_epi8(a: __m256i, idx: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpermi2b256(a.as_i8x32(), idx.as_i8x32(), b.as_i8x32())) }
 }

 /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -84,14 +90,16 @@ pub unsafe fn _mm256_permutex2var_epi8(a: __m256i, idx: __m256i, b: __m256i) ->
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermt2b))]
-pub unsafe fn _mm256_mask_permutex2var_epi8(
+pub fn _mm256_mask_permutex2var_epi8(
    a: __m256i,
    k: __mmask32,
    idx: __m256i,
    b: __m256i,
 ) -> __m256i {
-    let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32();
-    transmute(simd_select_bitmask(k, permute, a.as_i8x32()))
+    unsafe {
+        let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32();
+        transmute(simd_select_bitmask(k, permute, a.as_i8x32()))
+    }
 }

 /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -101,14 +109,16 @@ pub unsafe fn _mm256_mask_permutex2var_epi8(
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b
-pub unsafe fn _mm256_maskz_permutex2var_epi8(
+pub fn _mm256_maskz_permutex2var_epi8(
    k: __mmask32,
    a: __m256i,
    idx: __m256i,
    b: __m256i,
 ) -> __m256i {
-    let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32();
-    transmute(simd_select_bitmask(k, permute, i8x32::ZERO))
+    unsafe {
+        let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32();
+        transmute(simd_select_bitmask(k, permute, i8x32::ZERO))
+    }
 }

 /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
@ -118,14 +128,16 @@ pub unsafe fn _mm256_maskz_permutex2var_epi8(
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermi2b))]
-pub unsafe fn _mm256_mask2_permutex2var_epi8(
+pub fn _mm256_mask2_permutex2var_epi8(
    a: __m256i,
    idx: __m256i,
    k: __mmask32,
    b: __m256i,
 ) -> __m256i {
-    let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32();
-    transmute(simd_select_bitmask(k, permute, idx.as_i8x32()))
+    unsafe {
+        let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32();
+        transmute(simd_select_bitmask(k, permute, idx.as_i8x32()))
+    }
 }

 /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
@ -135,8 +147,8 @@ pub unsafe fn _mm256_mask2_permutex2var_epi8(
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b
-pub unsafe fn _mm_permutex2var_epi8(a: __m128i, idx: __m128i, b: __m128i) -> __m128i {
-    transmute(vpermi2b128(a.as_i8x16(), idx.as_i8x16(), b.as_i8x16()))
+pub fn _mm_permutex2var_epi8(a: __m128i, idx: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpermi2b128(a.as_i8x16(), idx.as_i8x16(), b.as_i8x16())) }
 }

 /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -146,14 +158,11 @@ pub unsafe fn _mm_permutex2var_epi8(a: __m128i, idx: __m128i, b: __m128i) -> __m
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermt2b))]
-pub unsafe fn _mm_mask_permutex2var_epi8(
-    a: __m128i,
-    k: __mmask16,
-    idx: __m128i,
-    b: __m128i,
-) -> __m128i {
-    let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16();
-    transmute(simd_select_bitmask(k, permute, a.as_i8x16()))
+pub fn _mm_mask_permutex2var_epi8(a: __m128i, k: __mmask16, idx: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16();
+        transmute(simd_select_bitmask(k, permute, a.as_i8x16()))
+    }
 }

 /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -163,14 +172,11 @@ pub unsafe fn _mm_mask_permutex2var_epi8(
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b
-pub unsafe fn _mm_maskz_permutex2var_epi8(
-    k: __mmask16,
-    a: __m128i,
-    idx: __m128i,
-    b: __m128i,
-) -> __m128i {
-    let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16();
-    transmute(simd_select_bitmask(k, permute, i8x16::ZERO))
+pub fn _mm_maskz_permutex2var_epi8(k: __mmask16, a: __m128i, idx: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16();
+        transmute(simd_select_bitmask(k, permute, i8x16::ZERO))
+    }
 }

 /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
@ -180,14 +186,11 @@ pub unsafe fn _mm_maskz_permutex2var_epi8(
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermi2b))]
-pub unsafe fn _mm_mask2_permutex2var_epi8(
-    a: __m128i,
-    idx: __m128i,
-    k: __mmask16,
-    b: __m128i,
-) -> __m128i {
-    let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16();
-    transmute(simd_select_bitmask(k, permute, idx.as_i8x16()))
+pub fn _mm_mask2_permutex2var_epi8(a: __m128i, idx: __m128i, k: __mmask16, b: __m128i) -> __m128i {
+    unsafe {
+        let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16();
+        transmute(simd_select_bitmask(k, permute, idx.as_i8x16()))
+    }
 }

 /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
@ -197,8 +200,8 @@ pub unsafe fn _mm_mask2_permutex2var_epi8(
 #[target_feature(enable = "avx512vbmi")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermb))]
-pub unsafe fn _mm512_permutexvar_epi8(idx: __m512i, a: __m512i) -> __m512i {
-    transmute(vpermb(a.as_i8x64(), idx.as_i8x64()))
+pub fn _mm512_permutexvar_epi8(idx: __m512i, a: __m512i) -> __m512i {
+    unsafe { transmute(vpermb(a.as_i8x64(), idx.as_i8x64())) }
 }

 /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -208,14 +211,16 @@ pub unsafe fn _mm512_permutexvar_epi8(idx: __m512i, a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512vbmi")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermb))]
-pub unsafe fn _mm512_mask_permutexvar_epi8(
+pub fn _mm512_mask_permutexvar_epi8(
    src: __m512i,
    k: __mmask64,
    idx: __m512i,
    a: __m512i,
 ) -> __m512i {
-    let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64();
-    transmute(simd_select_bitmask(k, permute, src.as_i8x64()))
+    unsafe {
+        let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64();
+        transmute(simd_select_bitmask(k, permute, src.as_i8x64()))
+    }
 }

 /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -225,9 +230,11 @@ pub unsafe fn _mm512_mask_permutexvar_epi8(
 #[target_feature(enable = "avx512vbmi")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermb))]
-pub unsafe fn _mm512_maskz_permutexvar_epi8(k: __mmask64, idx: __m512i, a: __m512i) -> __m512i {
-    let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64();
-    transmute(simd_select_bitmask(k, permute, i8x64::ZERO))
+pub fn _mm512_maskz_permutexvar_epi8(k: __mmask64, idx: __m512i, a: __m512i) -> __m512i {
+    unsafe {
+        let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64();
+        transmute(simd_select_bitmask(k, permute, i8x64::ZERO))
+    }
 }

 /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
@ -237,8 +244,8 @@ pub unsafe fn _mm512_maskz_permutexvar_epi8(k: __mmask64, idx: __m512i, a: __m51
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermb))]
-pub unsafe fn _mm256_permutexvar_epi8(idx: __m256i, a: __m256i) -> __m256i {
-    transmute(vpermb256(a.as_i8x32(), idx.as_i8x32()))
+pub fn _mm256_permutexvar_epi8(idx: __m256i, a: __m256i) -> __m256i {
+    unsafe { transmute(vpermb256(a.as_i8x32(), idx.as_i8x32())) }
 }

 /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -248,14 +255,16 @@ pub unsafe fn _mm256_permutexvar_epi8(idx: __m256i, a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermb))]
-pub unsafe fn _mm256_mask_permutexvar_epi8(
+pub fn _mm256_mask_permutexvar_epi8(
    src: __m256i,
    k: __mmask32,
    idx: __m256i,
    a: __m256i,
 ) -> __m256i {
-    let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32();
-    transmute(simd_select_bitmask(k, permute, src.as_i8x32()))
+    unsafe {
+        let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32();
+        transmute(simd_select_bitmask(k, permute, src.as_i8x32()))
+    }
 }

 /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -265,9 +274,11 @@ pub unsafe fn _mm256_mask_permutexvar_epi8(
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermb))]
-pub unsafe fn _mm256_maskz_permutexvar_epi8(k: __mmask32, idx: __m256i, a: __m256i) -> __m256i {
-    let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32();
-    transmute(simd_select_bitmask(k, permute, i8x32::ZERO))
+pub fn _mm256_maskz_permutexvar_epi8(k: __mmask32, idx: __m256i, a: __m256i) -> __m256i {
+    unsafe {
+        let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32();
+        transmute(simd_select_bitmask(k, permute, i8x32::ZERO))
+    }
 }

 /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
@ -277,8 +288,8 @@ pub unsafe fn _mm256_maskz_permutexvar_epi8(k: __mmask32, idx: __m256i, a: __m25
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermb))]
-pub unsafe fn _mm_permutexvar_epi8(idx: __m128i, a: __m128i) -> __m128i {
-    transmute(vpermb128(a.as_i8x16(), idx.as_i8x16()))
+pub fn _mm_permutexvar_epi8(idx: __m128i, a: __m128i) -> __m128i {
+    unsafe { transmute(vpermb128(a.as_i8x16(), idx.as_i8x16())) }
 }

 /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -288,14 +299,11 @@ pub unsafe fn _mm_permutexvar_epi8(idx: __m128i, a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermb))]
-pub unsafe fn _mm_mask_permutexvar_epi8(
-    src: __m128i,
-    k: __mmask16,
-    idx: __m128i,
-    a: __m128i,
-) -> __m128i {
-    let permute = _mm_permutexvar_epi8(idx, a).as_i8x16();
-    transmute(simd_select_bitmask(k, permute, src.as_i8x16()))
+pub fn _mm_mask_permutexvar_epi8(src: __m128i, k: __mmask16, idx: __m128i, a: __m128i) -> __m128i {
+    unsafe {
+        let permute = _mm_permutexvar_epi8(idx, a).as_i8x16();
+        transmute(simd_select_bitmask(k, permute, src.as_i8x16()))
+    }
 }

 /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -305,9 +313,11 @@ pub unsafe fn _mm_mask_permutexvar_epi8(
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpermb))]
-pub unsafe fn _mm_maskz_permutexvar_epi8(k: __mmask16, idx: __m128i, a: __m128i) -> __m128i {
-    let permute = _mm_permutexvar_epi8(idx, a).as_i8x16();
-    transmute(simd_select_bitmask(k, permute, i8x16::ZERO))
+pub fn _mm_maskz_permutexvar_epi8(k: __mmask16, idx: __m128i, a: __m128i) -> __m128i {
+    unsafe {
+        let permute = _mm_permutexvar_epi8(idx, a).as_i8x16();
+        transmute(simd_select_bitmask(k, permute, i8x16::ZERO))
+    }
 }

 /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
@ -317,8 +327,8 @@ pub unsafe fn _mm_maskz_permutexvar_epi8(k: __mmask16, idx: __m128i, a: __m128i)
 #[target_feature(enable = "avx512vbmi")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmultishiftqb))]
-pub unsafe fn _mm512_multishift_epi64_epi8(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpmultishiftqb(a.as_i8x64(), b.as_i8x64()))
+pub fn _mm512_multishift_epi64_epi8(a: __m512i, b: __m512i) -> __m512i {
+    unsafe { transmute(vpmultishiftqb(a.as_i8x64(), b.as_i8x64())) }
 }

 /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -328,14 +338,16 @@ pub unsafe fn _mm512_multishift_epi64_epi8(a: __m512i, b: __m512i) -> __m512i {
 #[target_feature(enable = "avx512vbmi")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmultishiftqb))]
-pub unsafe fn _mm512_mask_multishift_epi64_epi8(
+pub fn _mm512_mask_multishift_epi64_epi8(
    src: __m512i,
    k: __mmask64,
    a: __m512i,
    b: __m512i,
 ) -> __m512i {
-    let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64();
-    transmute(simd_select_bitmask(k, multishift, src.as_i8x64()))
+    unsafe {
+        let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64();
+        transmute(simd_select_bitmask(k, multishift, src.as_i8x64()))
+    }
 }

 /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -345,9 +357,11 @@ pub unsafe fn _mm512_mask_multishift_epi64_epi8(
 #[target_feature(enable = "avx512vbmi")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmultishiftqb))]
-pub unsafe fn _mm512_maskz_multishift_epi64_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
-    let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64();
-    transmute(simd_select_bitmask(k, multishift, i8x64::ZERO))
+pub fn _mm512_maskz_multishift_epi64_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64();
+        transmute(simd_select_bitmask(k, multishift, i8x64::ZERO))
+    }
 }

 /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
@ -357,8 +371,8 @@ pub unsafe fn _mm512_maskz_multishift_epi64_epi8(k: __mmask64, a: __m512i, b: __
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmultishiftqb))]
-pub unsafe fn _mm256_multishift_epi64_epi8(a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpmultishiftqb256(a.as_i8x32(), b.as_i8x32()))
+pub fn _mm256_multishift_epi64_epi8(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpmultishiftqb256(a.as_i8x32(), b.as_i8x32())) }
 }

 /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -368,14 +382,16 @@ pub unsafe fn _mm256_multishift_epi64_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmultishiftqb))]
-pub unsafe fn _mm256_mask_multishift_epi64_epi8(
+pub fn _mm256_mask_multishift_epi64_epi8(
    src: __m256i,
    k: __mmask32,
    a: __m256i,
    b: __m256i,
 ) -> __m256i {
-    let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32();
-    transmute(simd_select_bitmask(k, multishift, src.as_i8x32()))
+    unsafe {
+        let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32();
+        transmute(simd_select_bitmask(k, multishift, src.as_i8x32()))
+    }
 }

 /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -385,9 +401,11 @@ pub unsafe fn _mm256_mask_multishift_epi64_epi8(
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmultishiftqb))]
-pub unsafe fn _mm256_maskz_multishift_epi64_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32();
-    transmute(simd_select_bitmask(k, multishift, i8x32::ZERO))
+pub fn _mm256_maskz_multishift_epi64_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32();
+        transmute(simd_select_bitmask(k, multishift, i8x32::ZERO))
+    }
 }

 /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
@ -397,8 +415,8 @@ pub unsafe fn _mm256_maskz_multishift_epi64_epi8(k: __mmask32, a: __m256i, b: __
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmultishiftqb))]
-pub unsafe fn _mm_multishift_epi64_epi8(a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpmultishiftqb128(a.as_i8x16(), b.as_i8x16()))
+pub fn _mm_multishift_epi64_epi8(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpmultishiftqb128(a.as_i8x16(), b.as_i8x16())) }
 }

 /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -408,14 +426,16 @@ pub unsafe fn _mm_multishift_epi64_epi8(a: __m128i, b: __m128i) -> __m128i {
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmultishiftqb))]
-pub unsafe fn _mm_mask_multishift_epi64_epi8(
+pub fn _mm_mask_multishift_epi64_epi8(
    src: __m128i,
    k: __mmask16,
    a: __m128i,
    b: __m128i,
 ) -> __m128i {
-    let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16();
-    transmute(simd_select_bitmask(k, multishift, src.as_i8x16()))
+    unsafe {
+        let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, multishift, src.as_i8x16()))
+    }
 }

 /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -425,9 +445,11 @@ pub unsafe fn _mm_mask_multishift_epi64_epi8(
 #[target_feature(enable = "avx512vbmi,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmultishiftqb))]
-pub unsafe fn _mm_maskz_multishift_epi64_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16();
-    transmute(simd_select_bitmask(k, multishift, i8x16::ZERO))
+pub fn _mm_maskz_multishift_epi64_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16();
+        transmute(simd_select_bitmask(k, multishift, i8x16::ZERO))
+    }
 }

 #[allow(improper_ctypes)]
--- a/library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs
--- a/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs
@ -11,8 +11,8 @@ use stdarch_test::assert_instr;
 #[target_feature(enable = "avx512vnni")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
+pub fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    unsafe { transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -22,14 +22,11 @@ pub unsafe fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m51
 #[target_feature(enable = "avx512vnni")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm512_mask_dpwssd_epi32(
-    src: __m512i,
-    k: __mmask16,
-    a: __m512i,
-    b: __m512i,
-) -> __m512i {
-    let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+pub fn _mm512_mask_dpwssd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+    }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -39,14 +36,11 @@ pub unsafe fn _mm512_mask_dpwssd_epi32(
 #[target_feature(enable = "avx512vnni")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm512_maskz_dpwssd_epi32(
-    k: __mmask16,
-    src: __m512i,
-    a: __m512i,
-    b: __m512i,
-) -> __m512i {
-    let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, r, i32x16::ZERO))
+pub fn _mm512_maskz_dpwssd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, r, i32x16::ZERO))
+    }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@ -59,8 +53,8 @@ pub unsafe fn _mm512_maskz_dpwssd_epi32(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpdpwssd)
 )]
-pub unsafe fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@ -70,8 +64,8 @@ pub unsafe fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> _
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -81,14 +75,11 @@ pub unsafe fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm256_mask_dpwssd_epi32(
-    src: __m256i,
-    k: __mmask8,
-    a: __m256i,
-    b: __m256i,
-) -> __m256i {
-    let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8();
-    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+pub fn _mm256_mask_dpwssd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+    }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -98,14 +89,11 @@ pub unsafe fn _mm256_mask_dpwssd_epi32(
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm256_maskz_dpwssd_epi32(
-    k: __mmask8,
-    src: __m256i,
-    a: __m256i,
-    b: __m256i,
-) -> __m256i {
-    let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8();
-    transmute(simd_select_bitmask(k, r, i32x8::ZERO))
+pub fn _mm256_maskz_dpwssd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, r, i32x8::ZERO))
+    }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@ -118,8 +106,8 @@ pub unsafe fn _mm256_maskz_dpwssd_epi32(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpdpwssd)
 )]
-pub unsafe fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@ -129,8 +117,8 @@ pub unsafe fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m1
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -140,9 +128,11 @@ pub unsafe fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let r = _mm_dpwssd_epi32(src, a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+pub fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let r = _mm_dpwssd_epi32(src, a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+    }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -152,9 +142,11 @@ pub unsafe fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    let r = _mm_dpwssd_epi32(src, a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, r, i32x4::ZERO))
+pub fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let r = _mm_dpwssd_epi32(src, a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, r, i32x4::ZERO))
+    }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -164,8 +156,8 @@ pub unsafe fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: _
 #[target_feature(enable = "avx512vnni")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
+pub fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    unsafe { transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -175,14 +167,11 @@ pub unsafe fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m5
 #[target_feature(enable = "avx512vnni")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm512_mask_dpwssds_epi32(
-    src: __m512i,
-    k: __mmask16,
-    a: __m512i,
-    b: __m512i,
-) -> __m512i {
-    let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+pub fn _mm512_mask_dpwssds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+    }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -192,14 +181,11 @@ pub unsafe fn _mm512_mask_dpwssds_epi32(
 #[target_feature(enable = "avx512vnni")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm512_maskz_dpwssds_epi32(
-    k: __mmask16,
-    src: __m512i,
-    a: __m512i,
-    b: __m512i,
-) -> __m512i {
-    let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, r, i32x16::ZERO))
+pub fn _mm512_maskz_dpwssds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, r, i32x16::ZERO))
+    }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -212,8 +198,8 @@ pub unsafe fn _mm512_maskz_dpwssds_epi32(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpdpwssds)
 )]
-pub unsafe fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -223,8 +209,8 @@ pub unsafe fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) ->
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -234,14 +220,11 @@ pub unsafe fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m2
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm256_mask_dpwssds_epi32(
-    src: __m256i,
-    k: __mmask8,
-    a: __m256i,
-    b: __m256i,
-) -> __m256i {
-    let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8();
-    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+pub fn _mm256_mask_dpwssds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+    }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -251,14 +234,11 @@ pub unsafe fn _mm256_mask_dpwssds_epi32(
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm256_maskz_dpwssds_epi32(
-    k: __mmask8,
-    src: __m256i,
-    a: __m256i,
-    b: __m256i,
-) -> __m256i {
-    let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8();
-    transmute(simd_select_bitmask(k, r, i32x8::ZERO))
+pub fn _mm256_maskz_dpwssds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, r, i32x8::ZERO))
+    }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -271,8 +251,8 @@ pub unsafe fn _mm256_maskz_dpwssds_epi32(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpdpwssds)
 )]
-pub unsafe fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -282,8 +262,8 @@ pub unsafe fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -293,9 +273,11 @@ pub unsafe fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let r = _mm_dpwssds_epi32(src, a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+pub fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let r = _mm_dpwssds_epi32(src, a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+    }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -305,14 +287,11 @@ pub unsafe fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: _
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm_maskz_dpwssds_epi32(
-    k: __mmask8,
-    src: __m128i,
-    a: __m128i,
-    b: __m128i,
-) -> __m128i {
-    let r = _mm_dpwssds_epi32(src, a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, r, i32x4::ZERO))
+pub fn _mm_maskz_dpwssds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let r = _mm_dpwssds_epi32(src, a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, r, i32x4::ZERO))
+    }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@ -322,8 +301,8 @@ pub unsafe fn _mm_maskz_dpwssds_epi32(
 #[target_feature(enable = "avx512vnni")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
+pub fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    unsafe { transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -333,14 +312,11 @@ pub unsafe fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m51
 #[target_feature(enable = "avx512vnni")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm512_mask_dpbusd_epi32(
-    src: __m512i,
-    k: __mmask16,
-    a: __m512i,
-    b: __m512i,
-) -> __m512i {
-    let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+pub fn _mm512_mask_dpbusd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+    }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -350,14 +326,11 @@ pub unsafe fn _mm512_mask_dpbusd_epi32(
 #[target_feature(enable = "avx512vnni")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm512_maskz_dpbusd_epi32(
-    k: __mmask16,
-    src: __m512i,
-    a: __m512i,
-    b: __m512i,
-) -> __m512i {
-    let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, r, i32x16::ZERO))
+pub fn _mm512_maskz_dpbusd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, r, i32x16::ZERO))
+    }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@ -370,8 +343,8 @@ pub unsafe fn _mm512_maskz_dpbusd_epi32(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpdpbusd)
 )]
-pub unsafe fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@ -381,8 +354,8 @@ pub unsafe fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> _
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -392,14 +365,11 @@ pub unsafe fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm256_mask_dpbusd_epi32(
-    src: __m256i,
-    k: __mmask8,
-    a: __m256i,
-    b: __m256i,
-) -> __m256i {
-    let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8();
-    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+pub fn _mm256_mask_dpbusd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+    }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -409,14 +379,11 @@ pub unsafe fn _mm256_mask_dpbusd_epi32(
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm256_maskz_dpbusd_epi32(
-    k: __mmask8,
-    src: __m256i,
-    a: __m256i,
-    b: __m256i,
-) -> __m256i {
-    let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8();
-    transmute(simd_select_bitmask(k, r, i32x8::ZERO))
+pub fn _mm256_maskz_dpbusd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, r, i32x8::ZERO))
+    }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@ -429,8 +396,8 @@ pub unsafe fn _mm256_maskz_dpbusd_epi32(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpdpbusd)
 )]
-pub unsafe fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@ -440,8 +407,8 @@ pub unsafe fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m1
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -451,9 +418,11 @@ pub unsafe fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let r = _mm_dpbusd_epi32(src, a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+pub fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let r = _mm_dpbusd_epi32(src, a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+    }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -463,9 +432,11 @@ pub unsafe fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    let r = _mm_dpbusd_epi32(src, a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, r, i32x4::ZERO))
+pub fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let r = _mm_dpbusd_epi32(src, a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, r, i32x4::ZERO))
+    }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -475,8 +446,8 @@ pub unsafe fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: _
 #[target_feature(enable = "avx512vnni")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
+pub fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    unsafe { transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -486,14 +457,11 @@ pub unsafe fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m5
 #[target_feature(enable = "avx512vnni")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm512_mask_dpbusds_epi32(
-    src: __m512i,
-    k: __mmask16,
-    a: __m512i,
-    b: __m512i,
-) -> __m512i {
-    let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+pub fn _mm512_mask_dpbusds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+    }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -503,14 +471,11 @@ pub unsafe fn _mm512_mask_dpbusds_epi32(
 #[target_feature(enable = "avx512vnni")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm512_maskz_dpbusds_epi32(
-    k: __mmask16,
-    src: __m512i,
-    a: __m512i,
-    b: __m512i,
-) -> __m512i {
-    let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, r, i32x16::ZERO))
+pub fn _mm512_maskz_dpbusds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, r, i32x16::ZERO))
+    }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -523,8 +488,8 @@ pub unsafe fn _mm512_maskz_dpbusds_epi32(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpdpbusds)
 )]
-pub unsafe fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -534,8 +499,8 @@ pub unsafe fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) ->
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -545,14 +510,11 @@ pub unsafe fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m2
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm256_mask_dpbusds_epi32(
-    src: __m256i,
-    k: __mmask8,
-    a: __m256i,
-    b: __m256i,
-) -> __m256i {
-    let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8();
-    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+pub fn _mm256_mask_dpbusds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+    }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -562,14 +524,11 @@ pub unsafe fn _mm256_mask_dpbusds_epi32(
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm256_maskz_dpbusds_epi32(
-    k: __mmask8,
-    src: __m256i,
-    a: __m256i,
-    b: __m256i,
-) -> __m256i {
-    let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8();
-    transmute(simd_select_bitmask(k, r, i32x8::ZERO))
+pub fn _mm256_maskz_dpbusds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, r, i32x8::ZERO))
+    }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -582,8 +541,8 @@ pub unsafe fn _mm256_maskz_dpbusds_epi32(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpdpbusds)
 )]
-pub unsafe fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -593,8 +552,8 @@ pub unsafe fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -604,9 +563,11 @@ pub unsafe fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let r = _mm_dpbusds_epi32(src, a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+pub fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let r = _mm_dpbusds_epi32(src, a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+    }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@ -616,14 +577,11 @@ pub unsafe fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: _
 #[target_feature(enable = "avx512vnni,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm_maskz_dpbusds_epi32(
-    k: __mmask8,
-    src: __m128i,
-    a: __m128i,
-    b: __m128i,
-) -> __m128i {
-    let r = _mm_dpbusds_epi32(src, a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, r, i32x4::ZERO))
+pub fn _mm_maskz_dpbusds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let r = _mm_dpbusds_epi32(src, a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, r, i32x4::ZERO))
+    }
 }

 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
@ -638,8 +596,8 @@ pub unsafe fn _mm_maskz_dpbusds_epi32(
    assert_instr(vpdpbssd)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
@ -654,8 +612,8 @@ pub unsafe fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
    assert_instr(vpdpbssd)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
@ -670,8 +628,8 @@ pub unsafe fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25
    assert_instr(vpdpbssds)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
@ -686,8 +644,8 @@ pub unsafe fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
    assert_instr(vpdpbssds)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
@ -702,8 +660,8 @@ pub unsafe fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m2
    assert_instr(vpdpbsud)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
@ -718,8 +676,8 @@ pub unsafe fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
    assert_instr(vpdpbsud)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
@ -734,8 +692,8 @@ pub unsafe fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25
    assert_instr(vpdpbsuds)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
@ -750,8 +708,8 @@ pub unsafe fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
    assert_instr(vpdpbsuds)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
@ -766,8 +724,8 @@ pub unsafe fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m2
    assert_instr(vpdpbuud)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
@ -782,8 +740,8 @@ pub unsafe fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
    assert_instr(vpdpbuud)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
@ -798,8 +756,8 @@ pub unsafe fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25
    assert_instr(vpdpbuuds)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
@ -814,8 +772,8 @@ pub unsafe fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
    assert_instr(vpdpbuuds)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
@ -830,8 +788,8 @@ pub unsafe fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m2
    assert_instr(vpdpwsud)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
@ -846,8 +804,8 @@ pub unsafe fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
    assert_instr(vpdpwsud)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
@ -862,8 +820,8 @@ pub unsafe fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25
    assert_instr(vpdpwsuds)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
@ -878,8 +836,8 @@ pub unsafe fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
    assert_instr(vpdpwsuds)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
@ -894,8 +852,8 @@ pub unsafe fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m2
    assert_instr(vpdpwusd)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
@ -910,8 +868,8 @@ pub unsafe fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
    assert_instr(vpdpwusd)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
@ -926,8 +884,8 @@ pub unsafe fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25
    assert_instr(vpdpwusds)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
@ -942,8 +900,8 @@ pub unsafe fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
    assert_instr(vpdpwusds)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
@ -958,8 +916,8 @@ pub unsafe fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m2
    assert_instr(vpdpwuud)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
@ -974,8 +932,8 @@ pub unsafe fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
    assert_instr(vpdpwuud)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
@ -990,8 +948,8 @@ pub unsafe fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25
    assert_instr(vpdpwuuds)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+pub fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) }
 }

 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
@ -1006,8 +964,8 @@ pub unsafe fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i
    assert_instr(vpdpwuuds)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+pub fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) }
 }

 #[allow(improper_ctypes)]
--- a/library/stdarch/crates/core_arch/src/x86/avx512vpopcntdq.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vpopcntdq.rs
@ -26,8 +26,8 @@ use stdarch_test::assert_instr;
 #[target_feature(enable = "avx512vpopcntdq")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntd))]
-pub unsafe fn _mm512_popcnt_epi32(a: __m512i) -> __m512i {
-    transmute(simd_ctpop(a.as_i32x16()))
+pub fn _mm512_popcnt_epi32(a: __m512i) -> __m512i {
+    unsafe { transmute(simd_ctpop(a.as_i32x16())) }
 }

 /// For each packed 32-bit integer maps the value to the number of logical 1 bits.
@ -40,12 +40,14 @@ pub unsafe fn _mm512_popcnt_epi32(a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512vpopcntdq")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntd))]
-pub unsafe fn _mm512_maskz_popcnt_epi32(k: __mmask16, a: __m512i) -> __m512i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i32x16()),
-        i32x16::ZERO,
-    ))
+pub fn _mm512_maskz_popcnt_epi32(k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i32x16()),
+            i32x16::ZERO,
+        ))
+    }
 }

 /// For each packed 32-bit integer maps the value to the number of logical 1 bits.
@ -58,12 +60,14 @@ pub unsafe fn _mm512_maskz_popcnt_epi32(k: __mmask16, a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512vpopcntdq")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntd))]
-pub unsafe fn _mm512_mask_popcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i32x16()),
-        src.as_i32x16(),
-    ))
+pub fn _mm512_mask_popcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i32x16()),
+            src.as_i32x16(),
+        ))
+    }
 }

 /// For each packed 32-bit integer maps the value to the number of logical 1 bits.
@ -73,8 +77,8 @@ pub unsafe fn _mm512_mask_popcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -
 #[target_feature(enable = "avx512vpopcntdq,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntd))]
-pub unsafe fn _mm256_popcnt_epi32(a: __m256i) -> __m256i {
-    transmute(simd_ctpop(a.as_i32x8()))
+pub fn _mm256_popcnt_epi32(a: __m256i) -> __m256i {
+    unsafe { transmute(simd_ctpop(a.as_i32x8())) }
 }

 /// For each packed 32-bit integer maps the value to the number of logical 1 bits.
@ -87,12 +91,14 @@ pub unsafe fn _mm256_popcnt_epi32(a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512vpopcntdq,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntd))]
-pub unsafe fn _mm256_maskz_popcnt_epi32(k: __mmask8, a: __m256i) -> __m256i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i32x8()),
-        i32x8::ZERO,
-    ))
+pub fn _mm256_maskz_popcnt_epi32(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i32x8()),
+            i32x8::ZERO,
+        ))
+    }
 }

 /// For each packed 32-bit integer maps the value to the number of logical 1 bits.
@ -105,12 +111,14 @@ pub unsafe fn _mm256_maskz_popcnt_epi32(k: __mmask8, a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512vpopcntdq,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntd))]
-pub unsafe fn _mm256_mask_popcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i32x8()),
-        src.as_i32x8(),
-    ))
+pub fn _mm256_mask_popcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i32x8()),
+            src.as_i32x8(),
+        ))
+    }
 }

 /// For each packed 32-bit integer maps the value to the number of logical 1 bits.
@ -120,8 +128,8 @@ pub unsafe fn _mm256_mask_popcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) ->
 #[target_feature(enable = "avx512vpopcntdq,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntd))]
-pub unsafe fn _mm_popcnt_epi32(a: __m128i) -> __m128i {
-    transmute(simd_ctpop(a.as_i32x4()))
+pub fn _mm_popcnt_epi32(a: __m128i) -> __m128i {
+    unsafe { transmute(simd_ctpop(a.as_i32x4())) }
 }

 /// For each packed 32-bit integer maps the value to the number of logical 1 bits.
@ -134,12 +142,14 @@ pub unsafe fn _mm_popcnt_epi32(a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512vpopcntdq,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntd))]
-pub unsafe fn _mm_maskz_popcnt_epi32(k: __mmask8, a: __m128i) -> __m128i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i32x4()),
-        i32x4::ZERO,
-    ))
+pub fn _mm_maskz_popcnt_epi32(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i32x4()),
+            i32x4::ZERO,
+        ))
+    }
 }

 /// For each packed 32-bit integer maps the value to the number of logical 1 bits.
@ -152,12 +162,14 @@ pub unsafe fn _mm_maskz_popcnt_epi32(k: __mmask8, a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512vpopcntdq,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntd))]
-pub unsafe fn _mm_mask_popcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i32x4()),
-        src.as_i32x4(),
-    ))
+pub fn _mm_mask_popcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i32x4()),
+            src.as_i32x4(),
+        ))
+    }
 }

 /// For each packed 64-bit integer maps the value to the number of logical 1 bits.
@ -167,8 +179,8 @@ pub unsafe fn _mm_mask_popcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __
 #[target_feature(enable = "avx512vpopcntdq")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntq))]
-pub unsafe fn _mm512_popcnt_epi64(a: __m512i) -> __m512i {
-    transmute(simd_ctpop(a.as_i64x8()))
+pub fn _mm512_popcnt_epi64(a: __m512i) -> __m512i {
+    unsafe { transmute(simd_ctpop(a.as_i64x8())) }
 }

 /// For each packed 64-bit integer maps the value to the number of logical 1 bits.
@ -181,12 +193,14 @@ pub unsafe fn _mm512_popcnt_epi64(a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512vpopcntdq")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntq))]
-pub unsafe fn _mm512_maskz_popcnt_epi64(k: __mmask8, a: __m512i) -> __m512i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i64x8()),
-        i64x8::ZERO,
-    ))
+pub fn _mm512_maskz_popcnt_epi64(k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i64x8()),
+            i64x8::ZERO,
+        ))
+    }
 }

 /// For each packed 64-bit integer maps the value to the number of logical 1 bits.
@ -199,12 +213,14 @@ pub unsafe fn _mm512_maskz_popcnt_epi64(k: __mmask8, a: __m512i) -> __m512i {
 #[target_feature(enable = "avx512vpopcntdq")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntq))]
-pub unsafe fn _mm512_mask_popcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i64x8()),
-        src.as_i64x8(),
-    ))
+pub fn _mm512_mask_popcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i64x8()),
+            src.as_i64x8(),
+        ))
+    }
 }

 /// For each packed 64-bit integer maps the value to the number of logical 1 bits.
@ -214,8 +230,8 @@ pub unsafe fn _mm512_mask_popcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) ->
 #[target_feature(enable = "avx512vpopcntdq,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntq))]
-pub unsafe fn _mm256_popcnt_epi64(a: __m256i) -> __m256i {
-    transmute(simd_ctpop(a.as_i64x4()))
+pub fn _mm256_popcnt_epi64(a: __m256i) -> __m256i {
+    unsafe { transmute(simd_ctpop(a.as_i64x4())) }
 }

 /// For each packed 64-bit integer maps the value to the number of logical 1 bits.
@ -228,12 +244,14 @@ pub unsafe fn _mm256_popcnt_epi64(a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512vpopcntdq,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntq))]
-pub unsafe fn _mm256_maskz_popcnt_epi64(k: __mmask8, a: __m256i) -> __m256i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i64x4()),
-        i64x4::ZERO,
-    ))
+pub fn _mm256_maskz_popcnt_epi64(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i64x4()),
+            i64x4::ZERO,
+        ))
+    }
 }

 /// For each packed 64-bit integer maps the value to the number of logical 1 bits.
@ -246,12 +264,14 @@ pub unsafe fn _mm256_maskz_popcnt_epi64(k: __mmask8, a: __m256i) -> __m256i {
 #[target_feature(enable = "avx512vpopcntdq,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntq))]
-pub unsafe fn _mm256_mask_popcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i64x4()),
-        src.as_i64x4(),
-    ))
+pub fn _mm256_mask_popcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i64x4()),
+            src.as_i64x4(),
+        ))
+    }
 }

 /// For each packed 64-bit integer maps the value to the number of logical 1 bits.
@ -261,8 +281,8 @@ pub unsafe fn _mm256_mask_popcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) ->
 #[target_feature(enable = "avx512vpopcntdq,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntq))]
-pub unsafe fn _mm_popcnt_epi64(a: __m128i) -> __m128i {
-    transmute(simd_ctpop(a.as_i64x2()))
+pub fn _mm_popcnt_epi64(a: __m128i) -> __m128i {
+    unsafe { transmute(simd_ctpop(a.as_i64x2())) }
 }

 /// For each packed 64-bit integer maps the value to the number of logical 1 bits.
@ -275,12 +295,14 @@ pub unsafe fn _mm_popcnt_epi64(a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512vpopcntdq,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntq))]
-pub unsafe fn _mm_maskz_popcnt_epi64(k: __mmask8, a: __m128i) -> __m128i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i64x2()),
-        i64x2::ZERO,
-    ))
+pub fn _mm_maskz_popcnt_epi64(k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i64x2()),
+            i64x2::ZERO,
+        ))
+    }
 }

 /// For each packed 64-bit integer maps the value to the number of logical 1 bits.
@ -293,12 +315,14 @@ pub unsafe fn _mm_maskz_popcnt_epi64(k: __mmask8, a: __m128i) -> __m128i {
 #[target_feature(enable = "avx512vpopcntdq,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntq))]
-pub unsafe fn _mm_mask_popcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    transmute(simd_select_bitmask(
-        k,
-        simd_ctpop(a.as_i64x2()),
-        src.as_i64x2(),
-    ))
+pub fn _mm_mask_popcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    unsafe {
+        transmute(simd_select_bitmask(
+            k,
+            simd_ctpop(a.as_i64x2()),
+            src.as_i64x2(),
+        ))
+    }
 }

 #[cfg(test)]
--- a/library/stdarch/crates/core_arch/src/x86/avxneconvert.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avxneconvert.rs
@ -199,15 +199,17 @@ pub unsafe fn _mm256_cvtneoph_ps(a: *const __m256h) -> __m256 {
    assert_instr(vcvtneps2bf16)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh {
-    let mut dst: __m128bh;
-    asm!(
-        "{{vex}}vcvtneps2bf16 {dst},{src}",
-        dst = lateout(xmm_reg) dst,
-        src = in(xmm_reg) a,
-        options(pure, nomem, nostack, preserves_flags)
-    );
-    dst
+pub fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh {
+    unsafe {
+        let mut dst: __m128bh;
+        asm!(
+            "{{vex}}vcvtneps2bf16 {dst},{src}",
+            dst = lateout(xmm_reg) dst,
+            src = in(xmm_reg) a,
+            options(pure, nomem, nostack, preserves_flags)
+        );
+        dst
+    }
 }

 /// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point
@ -221,15 +223,17 @@ pub unsafe fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh {
    assert_instr(vcvtneps2bf16)
 )]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh {
-    let mut dst: __m128bh;
-    asm!(
-        "{{vex}}vcvtneps2bf16 {dst},{src}",
-        dst = lateout(xmm_reg) dst,
-        src = in(ymm_reg) a,
-        options(pure, nomem, nostack, preserves_flags)
-    );
-    dst
+pub fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh {
+    unsafe {
+        let mut dst: __m128bh;
+        asm!(
+            "{{vex}}vcvtneps2bf16 {dst},{src}",
+            dst = lateout(xmm_reg) dst,
+            src = in(ymm_reg) a,
+            options(pure, nomem, nostack, preserves_flags)
+        );
+        dst
+    }
 }

 #[allow(improper_ctypes)]
--- a/library/stdarch/crates/core_arch/src/x86_64/avx512bw.rs
+++ b/library/stdarch/crates/core_arch/src/x86_64/avx512bw.rs
@ -6,7 +6,7 @@ use crate::core_arch::x86::*;
 #[inline]
 #[target_feature(enable = "avx512bw")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _cvtmask64_u64(a: __mmask64) -> u64 {
+pub fn _cvtmask64_u64(a: __mmask64) -> u64 {
    a
 }

@ -16,7 +16,7 @@ pub unsafe fn _cvtmask64_u64(a: __mmask64) -> u64 {
 #[inline]
 #[target_feature(enable = "avx512bw")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _cvtu64_mask64(a: u64) -> __mmask64 {
+pub fn _cvtu64_mask64(a: u64) -> __mmask64 {
    a
 }

--- a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs
+++ b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs
@ -13,7 +13,7 @@ use stdarch_test::assert_instr;
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtsd2si))]
-pub unsafe fn _mm_cvtsd_i64(a: __m128d) -> i64 {
+pub fn _mm_cvtsd_i64(a: __m128d) -> i64 {
    _mm_cvtsd_si64(a)
 }

@ -24,7 +24,7 @@ pub unsafe fn _mm_cvtsd_i64(a: __m128d) -> i64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtss2si))]
-pub unsafe fn _mm_cvtss_i64(a: __m128) -> i64 {
+pub fn _mm_cvtss_i64(a: __m128) -> i64 {
    _mm_cvtss_si64(a)
 }

@ -35,8 +35,8 @@ pub unsafe fn _mm_cvtss_i64(a: __m128) -> i64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtss2usi))]
-pub unsafe fn _mm_cvtss_u64(a: __m128) -> u64 {
-    vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
+pub fn _mm_cvtss_u64(a: __m128) -> u64 {
+    unsafe { vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) }
 }

 /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.
@ -46,8 +46,8 @@ pub unsafe fn _mm_cvtss_u64(a: __m128) -> u64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtsd2usi))]
-pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 {
-    vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
+pub fn _mm_cvtsd_u64(a: __m128d) -> u64 {
+    unsafe { vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) }
 }

 /// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
@ -57,9 +57,11 @@ pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtsi2ss))]
-pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 {
-    let b = b as f32;
-    simd_insert!(a, 0, b)
+pub fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 {
+    unsafe {
+        let b = b as f32;
+        simd_insert!(a, 0, b)
+    }
 }

 /// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
@@ -69,9 +71,11 @@ pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtsi2sd))]
-pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d {
-    let b = b as f64;
-    simd_insert!(a, 0, b)
+pub fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d {
+    unsafe {
+        let b = b as f64;
+        simd_insert!(a, 0, b)
+    }
 }

 /// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -81,9 +85,11 @@ pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtusi2ss))]
-pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 {
-    let b = b as f32;
-    simd_insert!(a, 0, b)
+pub fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 {
+    unsafe {
+        let b = b as f32;
+        simd_insert!(a, 0, b)
+    }
 }

 /// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
@@ -93,9 +99,11 @@ pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtusi2sd))]
-pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d {
-    let b = b as f64;
-    simd_insert!(a, 0, b)
+pub fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d {
+    unsafe {
+        let b = b as f64;
+        simd_insert!(a, 0, b)
+    }
 }

 /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
@@ -105,8 +113,8 @@ pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvttsd2si))]
-pub unsafe fn _mm_cvttsd_i64(a: __m128d) -> i64 {
-    vcvttsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
+pub fn _mm_cvttsd_i64(a: __m128d) -> i64 {
+    unsafe { vcvttsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) }
 }

 /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.
@@ -116,8 +124,8 @@ pub unsafe fn _mm_cvttsd_i64(a: __m128d) -> i64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvttsd2usi))]
-pub unsafe fn _mm_cvttsd_u64(a: __m128d) -> u64 {
-    vcvttsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
+pub fn _mm_cvttsd_u64(a: __m128d) -> u64 {
+    unsafe { vcvttsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) }
 }

 /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
@@ -127,8 +135,8 @@ pub unsafe fn _mm_cvttsd_u64(a: __m128d) -> u64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvttss2si))]
-pub unsafe fn _mm_cvttss_i64(a: __m128) -> i64 {
-    vcvttss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
+pub fn _mm_cvttss_i64(a: __m128) -> i64 {
+    unsafe { vcvttss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) }
 }

 /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.
@@ -138,8 +146,8 @@ pub unsafe fn _mm_cvttss_i64(a: __m128) -> i64 {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvttss2usi))]
-pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 {
-    vcvttss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
+pub fn _mm_cvttss_u64(a: __m128) -> u64 {
+    unsafe { vcvttss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) }
 }

 /// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
@@ -156,11 +164,13 @@ pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm_cvt_roundi64_sd<const ROUNDING: i32>(a: __m128d, b: i64) -> __m128d {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f64x2();
-    let r = vcvtsi2sd64(a, b, ROUNDING);
-    transmute(r)
+pub fn _mm_cvt_roundi64_sd<const ROUNDING: i32>(a: __m128d, b: i64) -> __m128d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f64x2();
+        let r = vcvtsi2sd64(a, b, ROUNDING);
+        transmute(r)
+    }
 }

 /// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
@@ -177,11 +187,13 @@ pub unsafe fn _mm_cvt_roundi64_sd<const ROUNDING: i32>(a: __m128d, b: i64) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm_cvt_roundsi64_sd<const ROUNDING: i32>(a: __m128d, b: i64) -> __m128d {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f64x2();
-    let r = vcvtsi2sd64(a, b, ROUNDING);
-    transmute(r)
+pub fn _mm_cvt_roundsi64_sd<const ROUNDING: i32>(a: __m128d, b: i64) -> __m128d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f64x2();
+        let r = vcvtsi2sd64(a, b, ROUNDING);
+        transmute(r)
+    }
 }

 /// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -198,11 +210,13 @@ pub unsafe fn _mm_cvt_roundsi64_sd<const ROUNDING: i32>(a: __m128d, b: i64) -> _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm_cvt_roundi64_ss<const ROUNDING: i32>(a: __m128, b: i64) -> __m128 {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f32x4();
-    let r = vcvtsi2ss64(a, b, ROUNDING);
-    transmute(r)
+pub fn _mm_cvt_roundi64_ss<const ROUNDING: i32>(a: __m128, b: i64) -> __m128 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x4();
+        let r = vcvtsi2ss64(a, b, ROUNDING);
+        transmute(r)
+    }
 }

 /// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
@@ -219,11 +233,13 @@ pub unsafe fn _mm_cvt_roundi64_ss<const ROUNDING: i32>(a: __m128, b: i64) -> __m
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtusi2sd, ROUNDING = 8))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm_cvt_roundu64_sd<const ROUNDING: i32>(a: __m128d, b: u64) -> __m128d {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f64x2();
-    let r = vcvtusi2sd64(a, b, ROUNDING);
-    transmute(r)
+pub fn _mm_cvt_roundu64_sd<const ROUNDING: i32>(a: __m128d, b: u64) -> __m128d {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f64x2();
+        let r = vcvtusi2sd64(a, b, ROUNDING);
+        transmute(r)
+    }
 }

 /// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -240,11 +256,13 @@ pub unsafe fn _mm_cvt_roundu64_sd<const ROUNDING: i32>(a: __m128d, b: u64) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm_cvt_roundsi64_ss<const ROUNDING: i32>(a: __m128, b: i64) -> __m128 {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f32x4();
-    let r = vcvtsi2ss64(a, b, ROUNDING);
-    transmute(r)
+pub fn _mm_cvt_roundsi64_ss<const ROUNDING: i32>(a: __m128, b: i64) -> __m128 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x4();
+        let r = vcvtsi2ss64(a, b, ROUNDING);
+        transmute(r)
+    }
 }

 /// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -261,11 +279,13 @@ pub unsafe fn _mm_cvt_roundsi64_ss<const ROUNDING: i32>(a: __m128, b: i64) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm_cvt_roundu64_ss<const ROUNDING: i32>(a: __m128, b: u64) -> __m128 {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f32x4();
-    let r = vcvtusi2ss64(a, b, ROUNDING);
-    transmute(r)
+pub fn _mm_cvt_roundu64_ss<const ROUNDING: i32>(a: __m128, b: u64) -> __m128 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x4();
+        let r = vcvtusi2ss64(a, b, ROUNDING);
+        transmute(r)
+    }
 }

 /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
@@ -282,10 +302,12 @@ pub unsafe fn _mm_cvt_roundu64_ss<const ROUNDING: i32>(a: __m128, b: u64) -> __m
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm_cvt_roundsd_si64<const ROUNDING: i32>(a: __m128d) -> i64 {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f64x2();
-    vcvtsd2si64(a, ROUNDING)
+pub fn _mm_cvt_roundsd_si64<const ROUNDING: i32>(a: __m128d) -> i64 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f64x2();
+        vcvtsd2si64(a, ROUNDING)
+    }
 }

 /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
@@ -302,10 +324,12 @@ pub unsafe fn _mm_cvt_roundsd_si64<const ROUNDING: i32>(a: __m128d) -> i64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm_cvt_roundsd_i64<const ROUNDING: i32>(a: __m128d) -> i64 {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f64x2();
-    vcvtsd2si64(a, ROUNDING)
+pub fn _mm_cvt_roundsd_i64<const ROUNDING: i32>(a: __m128d) -> i64 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f64x2();
+        vcvtsd2si64(a, ROUNDING)
+    }
 }

 /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\
@@ -322,10 +346,12 @@ pub unsafe fn _mm_cvt_roundsd_i64<const ROUNDING: i32>(a: __m128d) -> i64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm_cvt_roundsd_u64<const ROUNDING: i32>(a: __m128d) -> u64 {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f64x2();
-    vcvtsd2usi64(a, ROUNDING)
+pub fn _mm_cvt_roundsd_u64<const ROUNDING: i32>(a: __m128d) -> u64 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f64x2();
+        vcvtsd2usi64(a, ROUNDING)
+    }
 }

 /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
@@ -342,10 +368,12 @@ pub unsafe fn _mm_cvt_roundsd_u64<const ROUNDING: i32>(a: __m128d) -> u64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm_cvt_roundss_si64<const ROUNDING: i32>(a: __m128) -> i64 {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f32x4();
-    vcvtss2si64(a, ROUNDING)
+pub fn _mm_cvt_roundss_si64<const ROUNDING: i32>(a: __m128) -> i64 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x4();
+        vcvtss2si64(a, ROUNDING)
+    }
 }

 /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
@@ -362,10 +390,12 @@ pub unsafe fn _mm_cvt_roundss_si64<const ROUNDING: i32>(a: __m128) -> i64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm_cvt_roundss_i64<const ROUNDING: i32>(a: __m128) -> i64 {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f32x4();
-    vcvtss2si64(a, ROUNDING)
+pub fn _mm_cvt_roundss_i64<const ROUNDING: i32>(a: __m128) -> i64 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x4();
+        vcvtss2si64(a, ROUNDING)
+    }
 }

 /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\
@@ -382,10 +412,12 @@ pub unsafe fn _mm_cvt_roundss_i64<const ROUNDING: i32>(a: __m128) -> i64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm_cvt_roundss_u64<const ROUNDING: i32>(a: __m128) -> u64 {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f32x4();
-    vcvtss2usi64(a, ROUNDING)
+pub fn _mm_cvt_roundss_u64<const ROUNDING: i32>(a: __m128) -> u64 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x4();
+        vcvtss2usi64(a, ROUNDING)
+    }
 }

 /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
@@ -397,10 +429,12 @@ pub unsafe fn _mm_cvt_roundss_u64<const ROUNDING: i32>(a: __m128) -> u64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm_cvtt_roundsd_si64<const SAE: i32>(a: __m128d) -> i64 {
-    static_assert_sae!(SAE);
-    let a = a.as_f64x2();
-    vcvttsd2si64(a, SAE)
+pub fn _mm_cvtt_roundsd_si64<const SAE: i32>(a: __m128d) -> i64 {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x2();
+        vcvttsd2si64(a, SAE)
+    }
 }

 /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
@@ -412,10 +446,12 @@ pub unsafe fn _mm_cvtt_roundsd_si64<const SAE: i32>(a: __m128d) -> i64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm_cvtt_roundsd_i64<const SAE: i32>(a: __m128d) -> i64 {
-    static_assert_sae!(SAE);
-    let a = a.as_f64x2();
-    vcvttsd2si64(a, SAE)
+pub fn _mm_cvtt_roundsd_i64<const SAE: i32>(a: __m128d) -> i64 {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x2();
+        vcvttsd2si64(a, SAE)
+    }
 }

 /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\
@@ -427,10 +463,12 @@ pub unsafe fn _mm_cvtt_roundsd_i64<const SAE: i32>(a: __m128d) -> i64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvttsd2usi, SAE = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm_cvtt_roundsd_u64<const SAE: i32>(a: __m128d) -> u64 {
-    static_assert_sae!(SAE);
-    let a = a.as_f64x2();
-    vcvttsd2usi64(a, SAE)
+pub fn _mm_cvtt_roundsd_u64<const SAE: i32>(a: __m128d) -> u64 {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f64x2();
+        vcvttsd2usi64(a, SAE)
+    }
 }

 /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
@@ -442,10 +480,12 @@ pub unsafe fn _mm_cvtt_roundsd_u64<const SAE: i32>(a: __m128d) -> u64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm_cvtt_roundss_i64<const SAE: i32>(a: __m128) -> i64 {
-    static_assert_sae!(SAE);
-    let a = a.as_f32x4();
-    vcvttss2si64(a, SAE)
+pub fn _mm_cvtt_roundss_i64<const SAE: i32>(a: __m128) -> i64 {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x4();
+        vcvttss2si64(a, SAE)
+    }
 }

 /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
@@ -457,10 +497,12 @@ pub unsafe fn _mm_cvtt_roundss_i64<const SAE: i32>(a: __m128) -> i64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm_cvtt_roundss_si64<const SAE: i32>(a: __m128) -> i64 {
-    static_assert_sae!(SAE);
-    let a = a.as_f32x4();
-    vcvttss2si64(a, SAE)
+pub fn _mm_cvtt_roundss_si64<const SAE: i32>(a: __m128) -> i64 {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x4();
+        vcvttss2si64(a, SAE)
+    }
 }

 /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\
@@ -472,10 +514,12 @@ pub unsafe fn _mm_cvtt_roundss_si64<const SAE: i32>(a: __m128) -> i64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvttss2usi, SAE = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm_cvtt_roundss_u64<const SAE: i32>(a: __m128) -> u64 {
-    static_assert_sae!(SAE);
-    let a = a.as_f32x4();
-    vcvttss2usi64(a, SAE)
+pub fn _mm_cvtt_roundss_u64<const SAE: i32>(a: __m128) -> u64 {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x4();
+        vcvttss2usi64(a, SAE)
+    }
 }

 #[allow(improper_ctypes)]
--- a/library/stdarch/crates/core_arch/src/x86_64/avx512fp16.rs
+++ b/library/stdarch/crates/core_arch/src/x86_64/avx512fp16.rs
@@ -11,8 +11,8 @@ use stdarch_test::assert_instr;
 #[target_feature(enable = "avx512fp16")]
 #[cfg_attr(test, assert_instr(vcvtsi2sh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_cvti64_sh(a: __m128h, b: i64) -> __m128h {
-    vcvtsi642sh(a, b, _MM_FROUND_CUR_DIRECTION)
+pub fn _mm_cvti64_sh(a: __m128h, b: i64) -> __m128h {
+    unsafe { vcvtsi642sh(a, b, _MM_FROUND_CUR_DIRECTION) }
 }

 /// Convert the signed 64-bit integer b to a half-precision (16-bit) floating-point element, store the
@@ -33,9 +33,11 @@ pub unsafe fn _mm_cvti64_sh(a: __m128h, b: i64) -> __m128h {
 #[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))]
 #[rustc_legacy_const_generics(2)]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_cvt_roundi64_sh<const ROUNDING: i32>(a: __m128h, b: i64) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    vcvtsi642sh(a, b, ROUNDING)
+pub fn _mm_cvt_roundi64_sh<const ROUNDING: i32>(a: __m128h, b: i64) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtsi642sh(a, b, ROUNDING)
+    }
 }

 /// Convert the unsigned 64-bit integer b to a half-precision (16-bit) floating-point element, store the
@@ -47,8 +49,8 @@ pub unsafe fn _mm_cvt_roundi64_sh<const ROUNDING: i32>(a: __m128h, b: i64) -> __
 #[target_feature(enable = "avx512fp16")]
 #[cfg_attr(test, assert_instr(vcvtusi2sh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_cvtu64_sh(a: __m128h, b: u64) -> __m128h {
-    vcvtusi642sh(a, b, _MM_FROUND_CUR_DIRECTION)
+pub fn _mm_cvtu64_sh(a: __m128h, b: u64) -> __m128h {
+    unsafe { vcvtusi642sh(a, b, _MM_FROUND_CUR_DIRECTION) }
 }

 /// Convert the unsigned 64-bit integer b to a half-precision (16-bit) floating-point element, store the
@@ -69,9 +71,11 @@ pub unsafe fn _mm_cvtu64_sh(a: __m128h, b: u64) -> __m128h {
 #[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))]
 #[rustc_legacy_const_generics(2)]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_cvt_roundu64_sh<const ROUNDING: i32>(a: __m128h, b: u64) -> __m128h {
-    static_assert_rounding!(ROUNDING);
-    vcvtusi642sh(a, b, ROUNDING)
+pub fn _mm_cvt_roundu64_sh<const ROUNDING: i32>(a: __m128h, b: u64) -> __m128h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtusi642sh(a, b, ROUNDING)
+    }
 }

 /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit integer, and store
@@ -82,8 +86,8 @@ pub unsafe fn _mm_cvt_roundu64_sh<const ROUNDING: i32>(a: __m128h, b: u64) -> __
 #[target_feature(enable = "avx512fp16")]
 #[cfg_attr(test, assert_instr(vcvtsh2si))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_cvtsh_i64(a: __m128h) -> i64 {
-    vcvtsh2si64(a, _MM_FROUND_CUR_DIRECTION)
+pub fn _mm_cvtsh_i64(a: __m128h) -> i64 {
+    unsafe { vcvtsh2si64(a, _MM_FROUND_CUR_DIRECTION) }
 }

 /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit integer, and store
@@ -103,9 +107,11 @@ pub unsafe fn _mm_cvtsh_i64(a: __m128h) -> i64 {
 #[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))]
 #[rustc_legacy_const_generics(1)]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_cvt_roundsh_i64<const ROUNDING: i32>(a: __m128h) -> i64 {
-    static_assert_rounding!(ROUNDING);
-    vcvtsh2si64(a, ROUNDING)
+pub fn _mm_cvt_roundsh_i64<const ROUNDING: i32>(a: __m128h) -> i64 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtsh2si64(a, ROUNDING)
+    }
 }

 /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit unsigned integer, and store
@@ -116,8 +122,8 @@ pub unsafe fn _mm_cvt_roundsh_i64<const ROUNDING: i32>(a: __m128h) -> i64 {
 #[target_feature(enable = "avx512fp16")]
 #[cfg_attr(test, assert_instr(vcvtsh2usi))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_cvtsh_u64(a: __m128h) -> u64 {
-    vcvtsh2usi64(a, _MM_FROUND_CUR_DIRECTION)
+pub fn _mm_cvtsh_u64(a: __m128h) -> u64 {
+    unsafe { vcvtsh2usi64(a, _MM_FROUND_CUR_DIRECTION) }
 }

 /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit unsigned integer, and store
@@ -137,9 +143,11 @@ pub unsafe fn _mm_cvtsh_u64(a: __m128h) -> u64 {
 #[cfg_attr(test, assert_instr(vcvtsh2usi, ROUNDING = 8))]
 #[rustc_legacy_const_generics(1)]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_cvt_roundsh_u64<const ROUNDING: i32>(a: __m128h) -> u64 {
-    static_assert_rounding!(ROUNDING);
-    vcvtsh2usi64(a, ROUNDING)
+pub fn _mm_cvt_roundsh_u64<const ROUNDING: i32>(a: __m128h) -> u64 {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vcvtsh2usi64(a, ROUNDING)
+    }
 }

 /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit integer with truncation,
@@ -150,8 +158,8 @@ pub unsafe fn _mm_cvt_roundsh_u64<const ROUNDING: i32>(a: __m128h) -> u64 {
 #[target_feature(enable = "avx512fp16")]
 #[cfg_attr(test, assert_instr(vcvttsh2si))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_cvttsh_i64(a: __m128h) -> i64 {
-    vcvttsh2si64(a, _MM_FROUND_CUR_DIRECTION)
+pub fn _mm_cvttsh_i64(a: __m128h) -> i64 {
+    unsafe { vcvttsh2si64(a, _MM_FROUND_CUR_DIRECTION) }
 }

 /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit integer with truncation,
@@ -165,9 +173,11 @@ pub unsafe fn _mm_cvttsh_i64(a: __m128h) -> i64 {
 #[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))]
 #[rustc_legacy_const_generics(1)]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_cvtt_roundsh_i64<const SAE: i32>(a: __m128h) -> i64 {
-    static_assert_sae!(SAE);
-    vcvttsh2si64(a, SAE)
+pub fn _mm_cvtt_roundsh_i64<const SAE: i32>(a: __m128h) -> i64 {
+    unsafe {
+        static_assert_sae!(SAE);
+        vcvttsh2si64(a, SAE)
+    }
 }

 /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit unsigned integer with truncation,
@@ -178,8 +188,8 @@ pub unsafe fn _mm_cvtt_roundsh_i64<const SAE: i32>(a: __m128h) -> i64 {
 #[target_feature(enable = "avx512fp16")]
 #[cfg_attr(test, assert_instr(vcvttsh2usi))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_cvttsh_u64(a: __m128h) -> u64 {
-    vcvttsh2usi64(a, _MM_FROUND_CUR_DIRECTION)
+pub fn _mm_cvttsh_u64(a: __m128h) -> u64 {
+    unsafe { vcvttsh2usi64(a, _MM_FROUND_CUR_DIRECTION) }
 }

 /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit unsigned integer with truncation,
@@ -193,9 +203,11 @@ pub unsafe fn _mm_cvttsh_u64(a: __m128h) -> u64 {
 #[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))]
 #[rustc_legacy_const_generics(1)]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_cvtt_roundsh_u64<const SAE: i32>(a: __m128h) -> u64 {
-    static_assert_sae!(SAE);
-    vcvttsh2usi64(a, SAE)
+pub fn _mm_cvtt_roundsh_u64<const SAE: i32>(a: __m128h) -> u64 {
+    unsafe {
+        static_assert_sae!(SAE);
+        vcvttsh2usi64(a, SAE)
+    }
 }

 #[allow(improper_ctypes)]