From 7cc2822ce8acaa08748a121d473b964e55c44f18 Mon Sep 17 00:00:00 2001
From: usamoi
Date: Sun, 26 Jan 2025 20:56:33 +0800
Subject: [PATCH] mark AVX intrinsics as safe

Mark all AVX SIMD-computing intrinsics as safe, except for those involving
memory operations or register operations. `AVX+SHA512`, `AVX+SM3`, and
`AVX+SM4` intrinsics are missing from stdarch, so there is nothing to mark
for them.
---
 .../stdarch/crates/core_arch/src/x86/avx.rs   | 948 +++++++++---------
 .../crates/core_arch/src/x86_64/avx.rs        |   8 +-
 2 files changed, 503 insertions(+), 453 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs
index f78ca6d83913..5a8ed54fae1e 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx.rs
@@ -30,8 +30,8 @@ use stdarch_test::assert_instr;
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vaddpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
-    simd_add(a, b)
+pub fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
+    unsafe { simd_add(a, b) }
 }
 
 /// Adds packed single-precision (32-bit) floating-point elements in `a` and
@@ -42,8 +42,8 @@ pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vaddps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
-    simd_add(a, b)
+pub fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
+    unsafe { simd_add(a, b) }
 }
 
 /// Computes the bitwise AND of a packed double-precision (64-bit)
@@ -55,10 +55,12 @@ pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
 // See https://github.com/rust-lang/stdarch/issues/71
 #[cfg_attr(test, assert_instr(vandp))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
-    let a: u64x4 = transmute(a);
-    let b: u64x4 = transmute(b);
-    transmute(simd_and(a, b))
+pub fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
+    unsafe {
+        let a: u64x4 = transmute(a);
+        let b: u64x4 = transmute(b);
+        transmute(simd_and(a, b))
+    }
 }
 
 /// Computes the bitwise AND of packed single-precision (32-bit) floating-point
@@ -69,10 +71,12 @@ pub unsafe fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vandps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
-    let a: u32x8 = transmute(a);
-    let b: u32x8 = transmute(b);
-    transmute(simd_and(a, b))
+pub fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
+    unsafe {
+        let a: u32x8 = transmute(a);
+        let b: u32x8 = transmute(b);
+        transmute(simd_and(a, b))
+    }
 }
 
 /// Computes the bitwise OR packed double-precision (64-bit) floating-point
@@ -84,10 +88,12 @@ pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
 // See <https://github.com/rust-lang/stdarch/issues/71>.
 #[cfg_attr(test, assert_instr(vorp))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
-    let a: u64x4 = transmute(a);
-    let b: u64x4 = transmute(b);
-    transmute(simd_or(a, b))
+pub fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
+    unsafe {
+        let a: u64x4 = transmute(a);
+        let b: u64x4 = transmute(b);
+        transmute(simd_or(a, b))
+    }
 }
 
 /// Computes the bitwise OR packed single-precision (32-bit) floating-point
@@ -98,10 +104,12 @@ pub unsafe fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vorps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
-    let a: u32x8 = transmute(a);
-    let b: u32x8 = transmute(b);
-    transmute(simd_or(a, b))
+pub fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
+    unsafe {
+        let a: u32x8 = transmute(a);
+        let b: u32x8 = transmute(b);
+        transmute(simd_or(a, b))
+    }
 }
 
 /// Shuffles double-precision (64-bit) floating-point elements within 128-bit
@@ -113,18 +121,20 @@ pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
+pub fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
     static_assert_uimm_bits!(MASK, 8);
-    simd_shuffle!(
-        a,
-        b,
-        [
-            MASK as u32 & 0b1,
-            ((MASK as u32 >> 1) & 0b1) + 4,
-            ((MASK as u32 >> 2) & 0b1) + 2,
-            ((MASK as u32 >> 3) & 0b1) + 6,
-        ],
-    )
+    unsafe {
+        simd_shuffle!(
+            a,
+            b,
+            [
+                MASK as u32 & 0b1,
+                ((MASK as u32 >> 1) & 0b1) + 4,
+                ((MASK as u32 >> 2) & 0b1) + 2,
+                ((MASK as u32 >> 3) & 0b1) + 6,
+            ],
+        )
+    }
 }
 
 /// Shuffles single-precision (32-bit) floating-point elements in `a` within
@@ -136,22 +146,24 @@ pub unsafe fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m2
 #[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
+pub fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
     static_assert_uimm_bits!(MASK, 8);
-    simd_shuffle!(
-        a,
-        b,
-        [
-            MASK as u32 & 0b11,
-            (MASK as u32 >> 2) & 0b11,
-            ((MASK as u32 >> 4) & 0b11) + 8,
-            ((MASK as u32 >> 6) & 0b11) + 8,
-            (MASK as u32 & 0b11) + 4,
-            ((MASK as u32 >> 2) & 0b11) + 4,
-            ((MASK as u32 >> 4) & 0b11) + 12,
-            ((MASK as u32 >> 6) & 0b11) + 12,
-        ],
-    )
+    unsafe {
+        simd_shuffle!(
+            a,
+            b,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11) + 8,
+                ((MASK as u32 >> 6) & 0b11) + 8,
+                (MASK as u32 & 0b11) + 4,
+                ((MASK as u32 >> 2) & 0b11) + 4,
+                ((MASK as u32 >> 4) & 0b11) + 12,
+                ((MASK as u32 >> 6) & 0b11) + 12,
+            ],
+        )
+    }
 }
 
 /// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
@@ -162,10 +174,12 @@ pub unsafe fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vandnp))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
-    let a: u64x4 = transmute(a);
-    let b: u64x4 = transmute(b);
-    transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
+pub fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
+    unsafe {
+        let a: u64x4 = transmute(a);
+        let b: u64x4 = transmute(b);
+        transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
+    }
 }
 
 /// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
@@ -177,10 +191,12 @@ pub unsafe fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vandnps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
-    let a: u32x8 = transmute(a);
-    let b: u32x8 = transmute(b);
-    transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
+pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
+    unsafe {
+        let a: u32x8 = transmute(a);
+        let b: u32x8 = transmute(b);
+        transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
+    }
 }
 
 /// Compares packed double-precision (64-bit) floating-point elements
@@ -191,8 +207,8 @@ pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vmaxpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
-    vmaxpd(a, b)
+pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
+    unsafe { vmaxpd(a, b) }
 }
 
 /// Compares packed single-precision (32-bit) floating-point elements in `a`
@@ -203,8 +219,8 @@ pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vmaxps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
-    vmaxps(a, b)
+pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
+    unsafe { vmaxps(a, b) }
 }
 
 /// Compares packed double-precision (64-bit) floating-point elements
@@ -215,8 +231,8 @@ pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vminpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
-    vminpd(a, b)
+pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
+    unsafe { vminpd(a, b) }
 }
 
 /// Compares packed single-precision (32-bit) floating-point elements in `a`
@@ -227,8 +243,8 @@ pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vminps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
-    vminps(a, b)
+pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
+    unsafe { vminps(a, b) }
 }
 
 /// Multiplies packed double-precision (64-bit) floating-point elements
@@ -239,8 +255,8 @@ pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vmulpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
-    simd_mul(a, b)
+pub fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
+    unsafe { simd_mul(a, b) }
 }
 
 /// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
@@ -251,8 +267,8 @@ pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vmulps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
-    simd_mul(a, b)
+pub fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
+    unsafe { simd_mul(a, b) }
 }
 
 /// Alternatively adds and subtracts packed double-precision (64-bit)
@@ -263,12 +279,14 @@ pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
#[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddsubpd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { - let a = a.as_f64x4(); - let b = b.as_f64x4(); - let add = simd_add(a, b); - let sub = simd_sub(a, b); - simd_shuffle!(add, sub, [4, 1, 6, 3]) +pub fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [4, 1, 6, 3]) + } } /// Alternatively adds and subtracts packed single-precision (32-bit) @@ -279,12 +297,14 @@ pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddsubps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { - let a = a.as_f32x8(); - let b = b.as_f32x8(); - let add = simd_add(a, b); - let sub = simd_sub(a, b); - simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) +pub fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { + unsafe { + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) + } } /// Subtracts packed double-precision (64-bit) floating-point elements in `b` @@ -295,8 +315,8 @@ pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsubpd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { - simd_sub(a, b) +pub fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { simd_sub(a, b) } } /// Subtracts packed single-precision (32-bit) floating-point elements in `b` @@ -307,8 +327,8 @@ pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsubps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { - simd_sub(a, b) +pub fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { + unsafe { simd_sub(a, b) } } /// Computes the division of each of the 8 packed 32-bit floating-point elements @@ -319,8 +339,8 @@ pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vdivps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { - simd_div(a, b) +pub fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { + unsafe { simd_div(a, b) } } /// Computes the division of each of the 4 packed 64-bit floating-point elements @@ -331,8 +351,8 @@ pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vdivpd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { - simd_div(a, b) +pub fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { simd_div(a, b) } } /// Rounds packed double-precision (64-bit) floating point elements in `a` @@ -353,9 +373,9 @@ pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { #[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_round_pd(a: __m256d) -> __m256d { +pub fn _mm256_round_pd(a: __m256d) -> __m256d { 
     static_assert_uimm_bits!(ROUNDING, 4);
-    roundpd256(a, ROUNDING)
+    unsafe { roundpd256(a, ROUNDING) }
 }
 
 /// Rounds packed double-precision (64-bit) floating point elements in `a`
@@ -366,8 +386,8 @@ pub unsafe fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vroundpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d {
-    simd_ceil(a)
+pub fn _mm256_ceil_pd(a: __m256d) -> __m256d {
+    unsafe { simd_ceil(a) }
 }
 
 /// Rounds packed double-precision (64-bit) floating point elements in `a`
@@ -378,8 +398,8 @@ pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vroundpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d {
-    simd_floor(a)
+pub fn _mm256_floor_pd(a: __m256d) -> __m256d {
+    unsafe { simd_floor(a) }
 }
 
 /// Rounds packed single-precision (32-bit) floating point elements in `a`
@@ -400,9 +420,9 @@ pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d {
 #[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
 #[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
+pub fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
     static_assert_uimm_bits!(ROUNDING, 4);
-    roundps256(a, ROUNDING)
+    unsafe { roundps256(a, ROUNDING) }
 }
 
 /// Rounds packed single-precision (32-bit) floating point elements in `a`
@@ -413,8 +433,8 @@ pub unsafe fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vroundps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 {
-    simd_ceil(a)
+pub fn _mm256_ceil_ps(a: __m256) -> __m256 {
+    unsafe { simd_ceil(a) }
 }
 
 /// Rounds packed single-precision (32-bit) floating point elements in `a`
@@ -425,8 +445,8 @@ pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vroundps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 {
-    simd_floor(a)
+pub fn _mm256_floor_ps(a: __m256) -> __m256 {
+    unsafe { simd_floor(a) }
 }
 
 /// Returns the square root of packed single-precision (32-bit) floating point
@@ -437,8 +457,8 @@ pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vsqrtps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 {
-    simd_fsqrt(a)
+pub fn _mm256_sqrt_ps(a: __m256) -> __m256 {
+    unsafe { simd_fsqrt(a) }
 }
 
 /// Returns the square root of packed double-precision (64-bit) floating point
@@ -449,8 +469,8 @@ pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vsqrtpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
-    simd_fsqrt(a)
+pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
+    unsafe { simd_fsqrt(a) }
 }
 
 /// Blends packed double-precision (64-bit) floating-point elements from
@@ -465,18 +485,20 @@ pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
 #[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
+pub fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
     static_assert_uimm_bits!(IMM4, 4);
-    simd_shuffle!(
-        a,
-        b,
-        [
-            ((IMM4 as u32 >> 0) & 1) * 4 + 0,
-            ((IMM4 as u32 >> 1) & 1) * 4 + 1,
-            ((IMM4 as u32 >> 2) & 1) * 4 + 2,
-            ((IMM4 as u32 >> 3) & 1) * 4 + 3,
-        ],
-    )
+    unsafe {
+        simd_shuffle!(
+            a,
+            b,
+            [
+                ((IMM4 as u32 >> 0) & 1) * 4 + 0,
+                ((IMM4 as u32 >> 1) & 1) * 4 + 1,
+                ((IMM4 as u32 >> 2) & 1) * 4 + 2,
+                ((IMM4 as u32 >> 3) & 1) * 4 + 3,
+            ],
+        )
+    }
 }
 
 /// Blends packed single-precision (32-bit) floating-point elements from
@@ -488,22 +510,24 @@ pub unsafe fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256
 #[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
+pub fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
     static_assert_uimm_bits!(IMM8, 8);
-    simd_shuffle!(
-        a,
-        b,
-        [
-            ((IMM8 as u32 >> 0) & 1) * 8 + 0,
-            ((IMM8 as u32 >> 1) & 1) * 8 + 1,
-            ((IMM8 as u32 >> 2) & 1) * 8 + 2,
-            ((IMM8 as u32 >> 3) & 1) * 8 + 3,
-            ((IMM8 as u32 >> 4) & 1) * 8 + 4,
-            ((IMM8 as u32 >> 5) & 1) * 8 + 5,
-            ((IMM8 as u32 >> 6) & 1) * 8 + 6,
-            ((IMM8 as u32 >> 7) & 1) * 8 + 7,
-        ],
-    )
+    unsafe {
+        simd_shuffle!(
+            a,
+            b,
+            [
+                ((IMM8 as u32 >> 0) & 1) * 8 + 0,
+                ((IMM8 as u32 >> 1) & 1) * 8 + 1,
+                ((IMM8 as u32 >> 2) & 1) * 8 + 2,
+                ((IMM8 as u32 >> 3) & 1) * 8 + 3,
+                ((IMM8 as u32 >> 4) & 1) * 8 + 4,
+                ((IMM8 as u32 >> 5) & 1) * 8 + 5,
+                ((IMM8 as u32 >> 6) & 1) * 8 + 6,
+                ((IMM8 as u32 >> 7) & 1) * 8 + 7,
+            ],
+        )
+    }
 }
 
 /// Blends packed double-precision (64-bit) floating-point elements from
@@ -514,9 +538,11 @@ pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vblendvpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
-    let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
-    transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
+pub fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
+    unsafe {
+        let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
+        transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
+    }
 }
 
 /// Blends packed single-precision (32-bit) floating-point elements from
@@ -527,9 +553,11 @@ pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vblendvps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
-    let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
-    transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
+pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+    unsafe {
+        let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
+        transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
+    }
 }
 
 /// Conditionally multiplies the packed single-precision (32-bit) floating-point
@@ -543,9 +571,9 @@ pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
+pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
     static_assert_uimm_bits!(IMM8, 8);
-    vdpps(a, b, IMM8)
+    unsafe { vdpps(a, b, IMM8) }
 }
 
 /// Horizontal addition of adjacent pairs in the two packed vectors
@@ -558,8 +586,8 @@ pub unsafe fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vhaddpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
-    vhaddpd(a, b)
+pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
+    unsafe { vhaddpd(a, b) }
 }
 
 /// Horizontal addition of adjacent pairs in the two packed vectors
@@ -573,8 +601,8 @@ pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vhaddps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
-    vhaddps(a, b)
+pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
+    unsafe { vhaddps(a, b) }
 }
 
 /// Horizontal subtraction of adjacent pairs in the two packed vectors
@@ -587,8 +615,8 @@ pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vhsubpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
-    vhsubpd(a, b)
+pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
+    unsafe { vhsubpd(a, b) }
 }
 
 /// Horizontal subtraction of adjacent pairs in the two packed vectors
@@ -602,8 +630,8 @@ pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vhsubps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
-    vhsubps(a, b)
+pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
+    unsafe { vhsubps(a, b) }
 }
 
 /// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
@@ -614,10 +642,12 @@ pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vxorp))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
-    let a: u64x4 = transmute(a);
-    let b: u64x4 = transmute(b);
-    transmute(simd_xor(a, b))
+pub fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
+    unsafe {
+        let a: u64x4 = transmute(a);
+        let b: u64x4 = transmute(b);
+        transmute(simd_xor(a, b))
+    }
 }
 
 /// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
@@ -628,10 +658,12 @@ pub unsafe fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vxorps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
-    let a: u32x8 = transmute(a);
-    let b: u32x8 = transmute(b);
-    transmute(simd_xor(a, b))
+pub fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
+    unsafe {
+        let a: u32x8 = transmute(a);
+        let b: u32x8 = transmute(b);
+        transmute(simd_xor(a, b))
+    }
 }
 
 /// Equal (ordered, non-signaling)
@@ -741,9 +773,9 @@ pub const _CMP_TRUE_US: i32 = 0x1f;
 #[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
+pub fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
     static_assert_uimm_bits!(IMM5, 5);
-    vcmppd(a, b, const { IMM5 as i8 })
+    unsafe { vcmppd(a, b, const { IMM5 as i8 }) }
 }
 
 /// Compares packed double-precision (64-bit) floating-point
@@ -756,9 +788,9 @@ pub unsafe fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
+pub fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
     static_assert_uimm_bits!(IMM5, 5);
-    vcmppd256(a, b, IMM5 as u8)
+    unsafe { vcmppd256(a, b, IMM5 as u8) }
 }
 
 /// Compares packed single-precision (32-bit) floating-point
@@ -771,9 +803,9 @@ pub unsafe fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d
 #[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
+pub fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
     static_assert_uimm_bits!(IMM5, 5);
-    vcmpps(a, b, const { IMM5 as i8 })
+    unsafe { vcmpps(a, b, const { IMM5 as i8 }) }
 }
 
 /// Compares packed single-precision (32-bit) floating-point
@@ -786,9 +818,9 @@ pub unsafe fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
 #[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
+pub fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
     static_assert_uimm_bits!(IMM5, 5);
-    vcmpps256(a, b, const { IMM5 as u8 })
+    unsafe { vcmpps256(a, b, const { IMM5 as u8 }) }
 }
 
 /// Compares the lower double-precision (64-bit) floating-point element in
@@ -803,9 +835,9 @@ pub unsafe fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
+pub fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
     static_assert_uimm_bits!(IMM5, 5);
-    vcmpsd(a, b, IMM5 as i8)
+    unsafe { vcmpsd(a, b, IMM5 as i8) }
 }
 
 /// Compares the lower single-precision (32-bit) floating-point element in
@@ -820,9 +852,9 @@ pub unsafe fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
+pub fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
     static_assert_uimm_bits!(IMM5, 5);
-    vcmpss(a, b, IMM5 as i8)
+    unsafe { vcmpss(a, b, IMM5 as i8) }
 }
 
 /// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
@@ -833,8 +865,8 @@ pub unsafe fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vcvtdq2pd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
-    simd_cast(a.as_i32x4())
+pub fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
+    unsafe { simd_cast(a.as_i32x4()) }
 }
 
 /// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
@@ -845,8 +877,8 @@ pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vcvtdq2ps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
-    simd_cast(a.as_i32x8())
+pub fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
+    unsafe { simd_cast(a.as_i32x8()) }
 }
 
 /// Converts packed double-precision (64-bit) floating-point elements in `a`
@@ -857,8 +889,8 @@ pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vcvtpd2ps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
-    simd_cast(a)
+pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
+    unsafe { simd_cast(a) }
 }
 
 /// Converts packed single-precision (32-bit) floating-point elements in `a`
@@ -869,8 +901,8 @@ pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vcvtps2dq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
-    transmute(vcvtps2dq(a))
+pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
+    unsafe { transmute(vcvtps2dq(a)) }
 }
 
 /// Converts packed single-precision (32-bit) floating-point elements in `a`
@@ -881,8 +913,8 @@ pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vcvtps2pd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d {
-    simd_cast(a)
+pub fn _mm256_cvtps_pd(a: __m128) -> __m256d {
+    unsafe { simd_cast(a) }
 }
 
 /// Returns the first element of the input vector of `[4 x double]`.
@@ -892,8 +924,8 @@ pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d {
 #[target_feature(enable = "avx")]
 //#[cfg_attr(test, assert_instr(movsd))] FIXME
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
-    simd_extract!(a, 0)
+pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
+    unsafe { simd_extract!(a, 0) }
 }
 
 /// Converts packed double-precision (64-bit) floating-point elements in `a`
@@ -904,8 +936,8 @@ pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vcvttpd2dq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
-    transmute(vcvttpd2dq(a))
+pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
+    unsafe { transmute(vcvttpd2dq(a)) }
 }
 
 /// Converts packed double-precision (64-bit) floating-point elements in `a`
@@ -916,8 +948,8 @@ pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vcvtpd2dq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
-    transmute(vcvtpd2dq(a))
+pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
+    unsafe { transmute(vcvtpd2dq(a)) }
 }
 
 /// Converts packed single-precision (32-bit) floating-point elements in `a`
@@ -928,8 +960,8 @@ pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vcvttps2dq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
-    transmute(vcvttps2dq(a))
+pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
+    unsafe { transmute(vcvttps2dq(a)) }
 }
 
 /// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
@@ -944,13 +976,15 @@ pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
 )]
 #[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
+pub fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
     static_assert_uimm_bits!(IMM1, 1);
-    simd_shuffle!(
-        a,
-        _mm256_undefined_ps(),
-        [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
-    )
+    unsafe {
+        simd_shuffle!(
+            a,
+            _mm256_undefined_ps(),
+            [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
+        )
+    }
 }
 
 /// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
@@ -965,9 +999,9 @@ pub unsafe fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
 )]
 #[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
+pub fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
     static_assert_uimm_bits!(IMM1, 1);
-    simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize])
+    unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) }
 }
 
 /// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
@@ -981,10 +1015,12 @@ pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
 )]
 #[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
+pub fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
     static_assert_uimm_bits!(IMM1, 1);
-    let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
-    transmute(dst)
+    unsafe {
+        let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
+        transmute(dst)
+    }
 }
 
 /// Extracts a 32-bit integer from `a`, selected with `INDEX`.
@@ -995,9 +1031,9 @@ pub unsafe fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
 // This intrinsic has no corresponding instruction.
 #[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
+pub fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
     static_assert_uimm_bits!(INDEX, 3);
-    simd_extract!(a.as_i32x8(), INDEX as u32)
+    unsafe { simd_extract!(a.as_i32x8(), INDEX as u32) }
 }
 
 /// Returns the first element of the input vector of `[8 x i32]`.
@@ -1006,8 +1042,8 @@ pub unsafe fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
 #[inline]
 #[target_feature(enable = "avx")]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
-    simd_extract!(a.as_i32x8(), 0)
+pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
+    unsafe { simd_extract!(a.as_i32x8(), 0) }
 }
 
 /// Zeroes the contents of all XMM or YMM registers.
@@ -1041,8 +1077,8 @@ pub unsafe fn _mm256_zeroupper() {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vpermilps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
-    vpermilps256(a, b.as_i32x8())
+pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
+    unsafe { vpermilps256(a, b.as_i32x8()) }
 }
 
 /// Shuffles single-precision (32-bit) floating-point elements in `a`
@@ -1053,8 +1089,8 @@ pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vpermilps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
-    vpermilps(a, b.as_i32x4())
+pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
+    unsafe { vpermilps(a, b.as_i32x4()) }
 }
 
 /// Shuffles single-precision (32-bit) floating-point elements in `a`
@@ -1066,22 +1102,24 @@ pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
 #[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
 #[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
+pub fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
     static_assert_uimm_bits!(IMM8, 8);
-    simd_shuffle!(
-        a,
-        _mm256_undefined_ps(),
-        [
-            (IMM8 as u32 >> 0) & 0b11,
-            (IMM8 as u32 >> 2) & 0b11,
-            (IMM8 as u32 >> 4) & 0b11,
-            (IMM8 as u32 >> 6) & 0b11,
-            ((IMM8 as u32 >> 0) & 0b11) + 4,
-            ((IMM8 as u32 >> 2) & 0b11) + 4,
-            ((IMM8 as u32 >> 4) & 0b11) + 4,
-            ((IMM8 as u32 >> 6) & 0b11) + 4,
-        ],
-    )
+    unsafe {
+        simd_shuffle!(
+            a,
+            _mm256_undefined_ps(),
+            [
+                (IMM8 as u32 >> 0) & 0b11,
+                (IMM8 as u32 >> 2) & 0b11,
+                (IMM8 as u32 >> 4) & 0b11,
+                (IMM8 as u32 >> 6) & 0b11,
+                ((IMM8 as u32 >> 0) & 0b11) + 4,
+                ((IMM8 as u32 >> 2) & 0b11) + 4,
+                ((IMM8 as u32 >> 4) & 0b11) + 4,
+                ((IMM8 as u32 >> 6) & 0b11) + 4,
+            ],
+        )
+    }
 }
 
 /// Shuffles single-precision (32-bit) floating-point elements in `a`
@@ -1093,18 +1131,20 @@ pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
 #[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
+pub fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
     static_assert_uimm_bits!(IMM8, 8);
-    simd_shuffle!(
-        a,
-        _mm_undefined_ps(),
-        [
-            (IMM8 as u32 >> 0) & 0b11,
-            (IMM8 as u32 >> 2) & 0b11,
-            (IMM8 as u32 >> 4) & 0b11,
-            (IMM8 as u32 >> 6) & 0b11,
-        ],
-    )
+    unsafe {
+        simd_shuffle!(
+            a,
+            _mm_undefined_ps(),
+            [
+                (IMM8 as u32 >> 0) & 0b11,
+                (IMM8 as u32 >> 2) & 0b11,
+                (IMM8 as u32 >> 4) & 0b11,
+                (IMM8 as u32 >> 6) & 0b11,
+            ],
+        )
+    }
 }
 
 /// Shuffles double-precision (64-bit) floating-point elements in `a`
@@ -1115,8 +1155,8 @@ pub unsafe fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vpermilpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
-    vpermilpd256(a, b.as_i64x4())
+pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
+    unsafe { vpermilpd256(a, b.as_i64x4()) }
 }
 
 /// Shuffles double-precision (64-bit) floating-point elements in `a`
@@ -1127,8 +1167,8 @@ pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vpermilpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
-    vpermilpd(a, b.as_i64x2())
+pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
+    unsafe { vpermilpd(a, b.as_i64x2()) }
 }
 
 /// Shuffles double-precision (64-bit) floating-point elements in `a`
@@ -1140,18 +1180,20 @@ pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
 #[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))]
 #[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
+pub fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
     static_assert_uimm_bits!(IMM4, 4);
-    simd_shuffle!(
-        a,
-        _mm256_undefined_pd(),
-        [
-            ((IMM4 as u32 >> 0) & 1),
-            ((IMM4 as u32 >> 1) & 1),
-            ((IMM4 as u32 >> 2) & 1) + 2,
-            ((IMM4 as u32 >> 3) & 1) + 2,
-        ],
-    )
+    unsafe {
+        simd_shuffle!(
+            a,
+            _mm256_undefined_pd(),
+            [
+                ((IMM4 as u32 >> 0) & 1),
+                ((IMM4 as u32 >> 1) & 1),
+                ((IMM4 as u32 >> 2) & 1) + 2,
+                ((IMM4 as u32 >> 3) & 1) + 2,
+            ],
+        )
+    }
 }
 
 /// Shuffles double-precision (64-bit) floating-point elements in `a`
@@ -1163,13 +1205,15 @@ pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
 #[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
 #[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
+pub fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
     static_assert_uimm_bits!(IMM2, 2);
-    simd_shuffle!(
-        a,
-        _mm_undefined_pd(),
-        [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
-    )
+    unsafe {
+        simd_shuffle!(
+            a,
+            _mm_undefined_pd(),
+            [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
+        )
+    }
 }
 
 /// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
@@ -1181,9 +1225,9 @@ pub unsafe fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
+pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
     static_assert_uimm_bits!(IMM8, 8);
-    vperm2f128ps256(a, b, IMM8 as i8)
+    unsafe { vperm2f128ps256(a, b, IMM8 as i8) }
 }
 
 /// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
@@ -1195,9 +1239,9 @@ pub unsafe fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> _
 #[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
+pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
     static_assert_uimm_bits!(IMM8, 8);
-    vperm2f128pd256(a, b, IMM8 as i8)
+    unsafe { vperm2f128pd256(a, b, IMM8 as i8) }
 }
 
 /// Shuffles 128-bits (composed of integer data) selected by `imm8`
@@ -1209,9 +1253,9 @@ pub unsafe fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) ->
 #[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+pub fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
     static_assert_uimm_bits!(IMM8, 8);
-    transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8))
+    unsafe { transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) }
 }
 
 /// Broadcasts a single-precision (32-bit) floating-point element from memory
@@ -1290,13 +1334,15 @@ pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
 )]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
+pub fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
     static_assert_uimm_bits!(IMM1, 1);
-    simd_shuffle!(
-        a,
-        _mm256_castps128_ps256(b),
-        [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
-    )
+    unsafe {
+        simd_shuffle!(
+            a,
+            _mm256_castps128_ps256(b),
+            [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
+        )
+    }
 }
 
 /// Copies `a` to result, then inserts 128 bits (composed of 2 packed
@@ -1312,13 +1358,15 @@ pub unsafe fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m
 )]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
+pub fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
     static_assert_uimm_bits!(IMM1, 1);
-    simd_shuffle!(
-        a,
-        _mm256_castpd128_pd256(b),
-        [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
-    )
+    unsafe {
+        simd_shuffle!(
+            a,
+            _mm256_castpd128_pd256(b),
+            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
+        )
+    }
 }
 
 /// Copies `a` to result, then inserts 128 bits from `b` into result
@@ -1333,14 +1381,16 @@ pub unsafe fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> _
 )]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
+pub fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
     static_assert_uimm_bits!(IMM1, 1);
-    let dst: i64x4 = simd_shuffle!(
-        a.as_i64x4(),
-        _mm256_castsi128_si256(b).as_i64x4(),
-        [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
-    );
-    transmute(dst)
+    unsafe {
+        let dst: i64x4 = simd_shuffle!(
+            a.as_i64x4(),
+            _mm256_castsi128_si256(b).as_i64x4(),
+            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
+        );
+        transmute(dst)
+    }
 }
 
 /// Copies `a` to result, and inserts the 8-bit integer `i` into result
@@ -1352,9 +1402,9 @@ pub unsafe fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -
 // This intrinsic has no corresponding instruction.
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
+pub fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
     static_assert_uimm_bits!(INDEX, 5);
-    transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i))
+    unsafe { transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i)) }
 }
 
 /// Copies `a` to result, and inserts the 16-bit integer `i` into result
@@ -1366,9 +1416,9 @@ pub unsafe fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i
 // This intrinsic has no corresponding instruction.
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
+pub fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
     static_assert_uimm_bits!(INDEX, 4);
-    transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i))
+    unsafe { transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i)) }
 }
 
 /// Copies `a` to result, and inserts the 32-bit integer `i` into result
@@ -1380,9 +1430,9 @@ pub unsafe fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m25
 // This intrinsic has no corresponding instruction.
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
+pub fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
     static_assert_uimm_bits!(INDEX, 3);
-    transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i))
+    unsafe { transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i)) }
 }
 
 /// Loads 256-bits (composed of 4 packed double-precision (64-bit)
@@ -1673,8 +1723,8 @@ pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vmovshdup))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 {
-    simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7])
+pub fn _mm256_movehdup_ps(a: __m256) -> __m256 {
+    unsafe { simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) }
 }
 
 /// Duplicate even-indexed single-precision (32-bit) floating-point elements
@@ -1685,8 +1735,8 @@ pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vmovsldup))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 {
-    simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6])
+pub fn _mm256_moveldup_ps(a: __m256) -> __m256 {
+    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) }
 }
 
 /// Duplicate even-indexed double-precision (64-bit) floating-point elements
@@ -1697,8 +1747,8 @@ pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vmovddup))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d {
-    simd_shuffle!(a, a, [0, 0, 2, 2])
+pub fn _mm256_movedup_pd(a: __m256d) -> __m256d {
+    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
 }
 
 /// Loads 256-bits of integer data from unaligned memory into result.
@@ -1807,8 +1857,8 @@ pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vrcpps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 {
-    vrcpps(a)
+pub fn _mm256_rcp_ps(a: __m256) -> __m256 {
+    unsafe { vrcpps(a) }
 }
 
 /// Computes the approximate reciprocal square root of packed single-precision
@@ -1820,8 +1870,8 @@ pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vrsqrtps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
-    vrsqrtps(a)
+pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
+    unsafe { vrsqrtps(a) }
 }
 
 /// Unpacks and interleave double-precision (64-bit) floating-point elements
@@ -1832,8 +1882,8 @@ pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vunpckhpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
-    simd_shuffle!(a, b, [1, 5, 3, 7])
+pub fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
+    unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) }
 }
 
 /// Unpacks and interleave single-precision (32-bit) floating-point elements
@@ -1844,8 +1894,8 @@ pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vunpckhps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
-    simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15])
+pub fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
+    unsafe { simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) }
 }
 
 /// Unpacks and interleave double-precision (64-bit) floating-point elements
@@ -1856,8 +1906,8 @@ pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vunpcklpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
-    simd_shuffle!(a, b, [0, 4, 2, 6])
+pub fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
+    unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) }
 }
 
 /// Unpacks and interleave single-precision (32-bit) floating-point elements
@@ -1868,8 +1918,8 @@ pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vunpcklps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
-    simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13])
+pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
+    unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) }
 }
 
 /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
@@ -1882,8 +1932,8 @@ pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vptest))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
-    ptestz256(a.as_i64x4(), b.as_i64x4())
+pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
+    unsafe { ptestz256(a.as_i64x4(), b.as_i64x4()) }
 }
 
 /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
@@ -1896,8 +1946,8 @@ pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
#[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vptest))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { - ptestc256(a.as_i64x4(), b.as_i64x4()) +pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { + unsafe { ptestc256(a.as_i64x4(), b.as_i64x4()) } } /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and @@ -1911,8 +1961,8 @@ pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vptest))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { - ptestnzc256(a.as_i64x4(), b.as_i64x4()) +pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { + unsafe { ptestnzc256(a.as_i64x4(), b.as_i64x4()) } } /// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) @@ -1928,8 +1978,8 @@ pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { - vtestzpd256(a, b) +pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { + unsafe { vtestzpd256(a, b) } } /// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) @@ -1945,8 +1995,8 @@ pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { - vtestcpd256(a, b) +pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { + unsafe { vtestcpd256(a, b) } } /// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) @@ -1963,8 +2013,8 @@ pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { - vtestnzcpd256(a, b) +pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { + unsafe { vtestnzcpd256(a, b) } } /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) @@ -1980,8 +2030,8 @@ pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { - vtestzpd(a, b) +pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { + unsafe { vtestzpd(a, b) } } /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) @@ -1997,8 +2047,8 @@ pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { - vtestcpd(a, b) +pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { + unsafe { vtestcpd(a, b) } } /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) @@ -2015,8 +2065,8 @@ pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { - vtestnzcpd(a, b) +pub fn 
_mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { + unsafe { vtestnzcpd(a, b) } } /// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) @@ -2032,8 +2082,8 @@ pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { - vtestzps256(a, b) +pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { + unsafe { vtestzps256(a, b) } } /// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) @@ -2049,8 +2099,8 @@ pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { - vtestcps256(a, b) +pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { + unsafe { vtestcps256(a, b) } } /// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) @@ -2067,8 +2117,8 @@ pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { - vtestnzcps256(a, b) +pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { + unsafe { vtestnzcps256(a, b) } } /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) @@ -2084,8 +2134,8 @@ pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { - vtestzps(a, b) +pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { + unsafe { vtestzps(a, b) } } /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) @@ -2101,8 +2151,8 @@ pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { - vtestcps(a, b) +pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { + unsafe { vtestcps(a, b) } } /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) @@ -2119,8 +2169,8 @@ pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { - vtestnzcps(a, b) +pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { + unsafe { vtestnzcps(a, b) } } /// Sets each bit of the returned mask based on the most significant bit of the @@ -2132,11 +2182,13 @@ pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovmskpd))] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { +pub fn _mm256_movemask_pd(a: __m256d) -> i32 { // Propagate the highest bit to the rest, because simd_bitmask // requires all-1 or all-0. 
-    let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
-    simd_bitmask::<i64x4, u8>(mask).into()
+    unsafe {
+        let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
+        simd_bitmask::<i64x4, u8>(mask).into()
+    }
 }
 
 /// Sets each bit of the returned mask based on the most significant bit of the
@@ -2148,11 +2200,13 @@ pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vmovmskps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
+pub fn _mm256_movemask_ps(a: __m256) -> i32 {
     // Propagate the highest bit to the rest, because simd_bitmask
     // requires all-1 or all-0.
-    let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
-    simd_bitmask::<i32x8, u8>(mask).into()
+    unsafe {
+        let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
+        simd_bitmask::<i32x8, u8>(mask).into()
+    }
 }
 
 /// Returns vector of type __m256d with all elements set to zero.
@@ -2162,8 +2216,8 @@ pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vxorp))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_setzero_pd() -> __m256d {
-    const { mem::zeroed() }
+pub fn _mm256_setzero_pd() -> __m256d {
+    const { unsafe { mem::zeroed() } }
 }
 
 /// Returns vector of type __m256 with all elements set to zero.
@@ -2173,8 +2227,8 @@ pub unsafe fn _mm256_setzero_pd() -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vxorps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_setzero_ps() -> __m256 {
-    const { mem::zeroed() }
+pub fn _mm256_setzero_ps() -> __m256 {
+    const { unsafe { mem::zeroed() } }
 }
 
 /// Returns vector of type __m256i with all elements set to zero.
@@ -2184,8 +2238,8 @@ pub unsafe fn _mm256_setzero_ps() -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vxor))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_setzero_si256() -> __m256i {
-    const { mem::zeroed() }
+pub fn _mm256_setzero_si256() -> __m256i {
+    const { unsafe { mem::zeroed() } }
 }
 
 /// Sets packed double-precision (64-bit) floating-point elements in returned
@@ -2197,7 +2251,7 @@ pub unsafe fn _mm256_setzero_si256() -> __m256i {
 // This intrinsic has no corresponding instruction.
 #[cfg_attr(test, assert_instr(vinsertf128))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
+pub fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
     _mm256_setr_pd(d, c, b, a)
 }
 
@@ -2209,16 +2263,7 @@ pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set_ps(
-    a: f32,
-    b: f32,
-    c: f32,
-    d: f32,
-    e: f32,
-    f: f32,
-    g: f32,
-    h: f32,
-) -> __m256 {
+pub fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
     _mm256_setr_ps(h, g, f, e, d, c, b, a)
 }
 
@@ -2229,7 +2274,7 @@ pub unsafe fn _mm256_set_ps(
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set_epi8(
+pub fn _mm256_set_epi8(
     e00: i8,
     e01: i8,
     e02: i8,
@@ -2279,7 +2324,7 @@ pub unsafe fn _mm256_set_epi8(
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
@@ -2279,7 +2324,7 @@ pub unsafe fn _mm256_set_epi8(
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set_epi16(
+pub fn _mm256_set_epi16(
     e00: i16,
     e01: i16,
     e02: i16,
@@ -2313,7 +2358,7 @@ pub unsafe fn _mm256_set_epi16(
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set_epi32(
+pub fn _mm256_set_epi32(
     e0: i32,
     e1: i32,
     e2: i32,
@@ -2333,7 +2378,7 @@ pub unsafe fn _mm256_set_epi32(
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
+pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
     _mm256_setr_epi64x(d, c, b, a)
 }
 
@@ -2345,7 +2390,7 @@ pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
+pub fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
     __m256d([a, b, c, d])
 }
 
@@ -2357,16 +2402,7 @@ pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_setr_ps(
-    a: f32,
-    b: f32,
-    c: f32,
-    d: f32,
-    e: f32,
-    f: f32,
-    g: f32,
-    h: f32,
-) -> __m256 {
+pub fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
     __m256([a, b, c, d, e, f, g, h])
 }
 
@@ -2378,7 +2414,7 @@ pub unsafe fn _mm256_setr_ps(
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_setr_epi8(
+pub fn _mm256_setr_epi8(
     e00: i8,
     e01: i8,
     e02: i8,
@@ -2412,13 +2448,15 @@ pub unsafe fn _mm256_setr_epi8(
     e30: i8,
     e31: i8,
 ) -> __m256i {
-    #[rustfmt::skip]
-    transmute(i8x32::new(
-        e00, e01, e02, e03, e04, e05, e06, e07,
-        e08, e09, e10, e11, e12, e13, e14, e15,
-        e16, e17, e18, e19, e20, e21, e22, e23,
-        e24, e25, e26, e27, e28, e29, e30, e31,
-    ))
+    unsafe {
+        #[rustfmt::skip]
+        transmute(i8x32::new(
+            e00, e01, e02, e03, e04, e05, e06, e07,
+            e08, e09, e10, e11, e12, e13, e14, e15,
+            e16, e17, e18, e19, e20, e21, e22, e23,
+            e24, e25, e26, e27, e28, e29, e30, e31,
+        ))
+    }
 }
 
 /// Sets packed 16-bit integers in returned vector with the supplied values in
@@ -2429,7 +2467,7 @@ pub unsafe fn _mm256_setr_epi8(
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_setr_epi16(
+pub fn _mm256_setr_epi16(
     e00: i16,
     e01: i16,
     e02: i16,
@@ -2447,13 +2485,15 @@ pub unsafe fn _mm256_setr_epi16(
     e14: i16,
     e15: i16,
 ) -> __m256i {
-    #[rustfmt::skip]
-    transmute(i16x16::new(
-        e00, e01, e02, e03,
-        e04, e05, e06, e07,
-        e08, e09, e10, e11,
-        e12, e13, e14, e15,
-    ))
+    unsafe {
+        #[rustfmt::skip]
+        transmute(i16x16::new(
+            e00, e01, e02, e03,
+            e04, e05, e06, e07,
+            e08, e09, e10, e11,
+            e12, e13, e14, e15,
+        ))
+    }
 }
 
 /// Sets packed 32-bit integers in returned vector with the supplied values in
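A point worth keeping straight when reading the `set`/`setr` pairs: `set` takes its arguments from the highest lane down and simply forwards to its `setr` ("reversed", i.e. memory-order) twin with the arguments flipped. A sketch of the equivalence (hypothetical helper, same assumptions as the earlier sketches):

    use core::arch::x86_64::*;

    /// Both constructors build the vector whose lane i holds the value i;
    /// `a` and `b` are bit-identical.
    #[target_feature(enable = "avx")]
    fn same_vector() -> (__m256i, __m256i) {
        let a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
        let b = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        (a, b)
    }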
@@ -2464,7 +2504,7 @@ pub unsafe fn _mm256_setr_epi16(
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_setr_epi32(
+pub fn _mm256_setr_epi32(
     e0: i32,
     e1: i32,
     e2: i32,
@@ -2474,7 +2514,7 @@ pub unsafe fn _mm256_setr_epi32(
     e6: i32,
     e7: i32,
 ) -> __m256i {
-    transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
+    unsafe { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
 }
 
 /// Sets packed 64-bit integers in returned vector with the supplied values in
@@ -2485,8 +2525,8 @@
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
-    transmute(i64x4::new(a, b, c, d))
+pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
+    unsafe { transmute(i64x4::new(a, b, c, d)) }
 }
 
 /// Broadcasts double-precision (64-bit) floating-point value `a` to all
@@ -2497,7 +2537,7 @@ pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d {
+pub fn _mm256_set1_pd(a: f64) -> __m256d {
     _mm256_setr_pd(a, a, a, a)
 }
 
@@ -2509,7 +2549,7 @@ pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d {
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 {
+pub fn _mm256_set1_ps(a: f32) -> __m256 {
     _mm256_setr_ps(a, a, a, a, a, a, a, a)
 }
 
@@ -2521,7 +2561,7 @@ pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 {
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set1_epi8(a: i8) -> __m256i {
+pub fn _mm256_set1_epi8(a: i8) -> __m256i {
     #[rustfmt::skip]
     _mm256_setr_epi8(
         a, a, a, a, a, a, a, a,
@@ -2541,7 +2581,7 @@ pub unsafe fn _mm256_set1_epi8(a: i8) -> __m256i {
 #[cfg_attr(test, assert_instr(vinsertf128))]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set1_epi16(a: i16) -> __m256i {
+pub fn _mm256_set1_epi16(a: i16) -> __m256i {
     _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
 }
 
@@ -2553,7 +2593,7 @@ pub unsafe fn _mm256_set1_epi16(a: i16) -> __m256i {
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i {
+pub fn _mm256_set1_epi32(a: i32) -> __m256i {
     _mm256_setr_epi32(a, a, a, a, a, a, a, a)
 }
 
@@ -2567,7 +2607,7 @@ pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i {
 #[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set1_epi64x(a: i64) -> __m256i {
+pub fn _mm256_set1_epi64x(a: i64) -> __m256i {
     _mm256_setr_epi64x(a, a, a, a)
 }
 
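The `set1` family broadcasts one scalar to every lane, the usual way to feed scalars into vector arithmetic. A sketch (hypothetical helper, same assumptions; `_mm256_mul_ps` is assumed to be among the computing intrinsics this patch covers):

    use core::arch::x86_64::*;

    /// Multiplies every lane of `v` by the scalar `s`.
    #[target_feature(enable = "avx")]
    fn scale(v: __m256, s: f32) -> __m256 {
        _mm256_mul_ps(v, _mm256_set1_ps(s))
    }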
@@ -2579,8 +2619,8 @@ pub unsafe fn _mm256_set1_epi64x(a: i64) -> __m256i {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_castpd_ps(a: __m256d) -> __m256 {
-    transmute(a)
+pub fn _mm256_castpd_ps(a: __m256d) -> __m256 {
+    unsafe { transmute(a) }
 }
 
 /// Cast vector of type __m256 to type __m256d.
@@ -2591,8 +2631,8 @@ pub unsafe fn _mm256_castpd_ps(a: __m256d) -> __m256 {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_castps_pd(a: __m256) -> __m256d {
-    transmute(a)
+pub fn _mm256_castps_pd(a: __m256) -> __m256d {
+    unsafe { transmute(a) }
 }
 
 /// Casts vector of type __m256 to type __m256i.
@@ -2603,8 +2643,8 @@ pub unsafe fn _mm256_castps_pd(a: __m256) -> __m256d {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_castps_si256(a: __m256) -> __m256i {
-    transmute(a)
+pub fn _mm256_castps_si256(a: __m256) -> __m256i {
+    unsafe { transmute(a) }
 }
 
 /// Casts vector of type __m256i to type __m256.
@@ -2615,8 +2655,8 @@ pub unsafe fn _mm256_castps_si256(a: __m256) -> __m256i {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
-    transmute(a)
+pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
+    unsafe { transmute(a) }
 }
 
 /// Casts vector of type __m256d to type __m256i.
@@ -2627,8 +2667,8 @@ pub unsafe fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_castpd_si256(a: __m256d) -> __m256i {
-    transmute(a)
+pub fn _mm256_castpd_si256(a: __m256d) -> __m256i {
+    unsafe { transmute(a) }
 }
 
 /// Casts vector of type __m256i to type __m256d.
@@ -2639,8 +2679,8 @@ pub unsafe fn _mm256_castpd_si256(a: __m256d) -> __m256i {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
-    transmute(a)
+pub fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
+    unsafe { transmute(a) }
 }
 
 /// Casts vector of type __m256 to type __m128.
@@ -2651,8 +2691,8 @@ pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 {
-    simd_shuffle!(a, a, [0, 1, 2, 3])
+pub fn _mm256_castps256_ps128(a: __m256) -> __m128 {
+    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }
 }
 
 /// Casts vector of type __m256d to type __m128d.
@@ -2663,8 +2703,8 @@ pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
-    simd_shuffle!(a, a, [0, 1])
+pub fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
+    unsafe { simd_shuffle!(a, a, [0, 1]) }
 }
 
 /// Casts vector of type __m256i to type __m128i.
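Every cast in this run is a pure bit reinterpretation: no lanes are converted and, as the comments note, no instruction is emitted. Round trips are therefore free, as in this sketch (hypothetical helper, same assumptions):

    use core::arch::x86_64::*;

    /// Reinterprets float lanes as an integer vector and back;
    /// the result is bit-identical to `v` and compiles to nothing.
    #[target_feature(enable = "avx")]
    fn bits_roundtrip(v: __m256) -> __m256 {
        let bits: __m256i = _mm256_castps_si256(v);
        _mm256_castsi256_ps(bits)
    }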
@@ -2675,10 +2715,12 @@ pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
-    let a = a.as_i64x4();
-    let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
-    transmute(dst)
+pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
+    unsafe {
+        let a = a.as_i64x4();
+        let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
+        transmute(dst)
+    }
 }
 
 /// Casts vector of type __m128 to type __m256;
@@ -2690,8 +2732,8 @@ pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
-    simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4])
+pub fn _mm256_castps128_ps256(a: __m128) -> __m256 {
+    unsafe { simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
 }
 
 /// Casts vector of type __m128d to type __m256d;
@@ -2703,8 +2745,8 @@ pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
-    simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2])
+pub fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
+    unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2]) }
 }
 
 /// Casts vector of type __m128i to type __m256i;
@@ -2716,11 +2758,13 @@ pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
-    let a = a.as_i64x2();
-    let undefined = i64x2::ZERO;
-    let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
-    transmute(dst)
+pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
+    unsafe {
+        let a = a.as_i64x2();
+        let undefined = i64x2::ZERO;
+        let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
+        transmute(dst)
+    }
 }
 
 /// Constructs a 256-bit floating-point vector of `[8 x float]` from a
@@ -2733,8 +2777,8 @@ pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
-    simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7])
+pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
+    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
}
 
 /// Constructs a 256-bit integer vector from a 128-bit integer vector.
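When widening 128-bit vectors, the `cast*128_*256` functions shuffle in lanes of an undefined vector, so the upper 128 bits of the result are indeterminate; the `zext*` functions instead shuffle in zeros, guaranteeing a clean upper half at the possible cost of a register move. A sketch of the contrast (hypothetical helper, same assumptions):

    use core::arch::x86_64::*;

    #[target_feature(enable = "avx")]
    fn widen(lo: __m128) -> (__m256, __m256) {
        let fast = _mm256_castps128_ps256(lo); // upper 4 lanes: indeterminate
        let clean = _mm256_zextps128_ps256(lo); // upper 4 lanes: 0.0
        (fast, clean)
    }

Prefer the cast when the upper half is about to be overwritten anyway, and `zext` whenever its contents can be observed.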
@@ -2747,10 +2791,12 @@ pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
-    let b = i64x2::ZERO;
-    let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
-    transmute(dst)
+pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
+    unsafe {
+        let b = i64x2::ZERO;
+        let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
+        transmute(dst)
+    }
 }
 
 /// Constructs a 256-bit floating-point vector of `[4 x double]` from a
@@ -2764,8 +2810,8 @@ pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
 // This intrinsic is only used for compilation and does not generate any
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
-    simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3])
+pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
+    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
 }
 
 /// Returns vector of type `__m256` with indeterminate elements.
@@ -2777,8 +2823,8 @@ pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_undefined_ps() -> __m256 {
-    const { mem::zeroed() }
+pub fn _mm256_undefined_ps() -> __m256 {
+    const { unsafe { mem::zeroed() } }
 }
 
 /// Returns vector of type `__m256d` with indeterminate elements.
@@ -2790,8 +2836,8 @@ pub unsafe fn _mm256_undefined_ps() -> __m256 {
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_undefined_pd() -> __m256d {
-    const { mem::zeroed() }
+pub fn _mm256_undefined_pd() -> __m256d {
+    const { unsafe { mem::zeroed() } }
 }
 
 /// Returns vector of type __m256i with with indeterminate elements.
@@ -2803,8 +2849,8 @@ pub unsafe fn _mm256_undefined_pd() -> __m256d {
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_undefined_si256() -> __m256i {
-    const { mem::zeroed() }
+pub fn _mm256_undefined_si256() -> __m256i {
+    const { unsafe { mem::zeroed() } }
 }
 
 /// Sets packed __m256 returned vector with the supplied values.
@@ -2814,8 +2860,8 @@ pub unsafe fn _mm256_undefined_si256() -> __m256i {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vinsertf128))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
-    simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7])
+pub fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
+    unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
 }
 
 /// Sets packed __m256d returned vector with the supplied values.
@@ -2825,10 +2871,12 @@ pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vinsertf128))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
-    let hi: __m128 = transmute(hi);
-    let lo: __m128 = transmute(lo);
-    transmute(_mm256_set_m128(hi, lo))
+pub fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
+    unsafe {
+        let hi: __m128 = transmute(hi);
+        let lo: __m128 = transmute(lo);
+        transmute(_mm256_set_m128(hi, lo))
+    }
 }
 
 /// Sets packed __m256i returned vector with the supplied values.
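`_mm256_set_m128` concatenates two 128-bit halves, with `lo` landing in lanes 0..4 and `hi` in lanes 4..8 (note that the shuffle picks `lo` first); the `setr_m128*` variants that follow merely swap the argument order. Sketch (hypothetical helper, same assumptions):

    use core::arch::x86_64::*;

    /// Builds [lo0, lo1, lo2, lo3, hi0, hi1, hi2, hi3].
    #[target_feature(enable = "avx")]
    fn concat(hi: __m128, lo: __m128) -> __m256 {
        // Identical to _mm256_setr_m128(lo, hi).
        _mm256_set_m128(hi, lo)
    }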
@@ -2838,10 +2886,12 @@ pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vinsertf128))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
-    let hi: __m128 = transmute(hi);
-    let lo: __m128 = transmute(lo);
-    transmute(_mm256_set_m128(hi, lo))
+pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
+    unsafe {
+        let hi: __m128 = transmute(hi);
+        let lo: __m128 = transmute(lo);
+        transmute(_mm256_set_m128(hi, lo))
+    }
 }
 
 /// Sets packed __m256 returned vector with the supplied values.
@@ -2851,7 +2901,7 @@ pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vinsertf128))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
+pub fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
     _mm256_set_m128(hi, lo)
 }
 
@@ -2862,7 +2912,7 @@ pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vinsertf128))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
+pub fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
     _mm256_set_m128d(hi, lo)
 }
 
@@ -2873,7 +2923,7 @@ pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
 #[target_feature(enable = "avx")]
 #[cfg_attr(test, assert_instr(vinsertf128))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
+pub fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
     _mm256_set_m128i(hi, lo)
 }
 
@@ -2978,8 +3028,8 @@ pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a
 #[target_feature(enable = "avx")]
 //#[cfg_attr(test, assert_instr(movss))] FIXME
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 {
-    simd_extract!(a, 0)
+pub fn _mm256_cvtss_f32(a: __m256) -> f32 {
+    unsafe { simd_extract!(a, 0) }
 }
 
 // LLVM intrinsics used in the above functions
diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx.rs b/library/stdarch/crates/core_arch/src/x86_64/avx.rs
index 71b8290eb877..b494385e4a61 100644
--- a/library/stdarch/crates/core_arch/src/x86_64/avx.rs
+++ b/library/stdarch/crates/core_arch/src/x86_64/avx.rs
@@ -24,9 +24,9 @@ use crate::{core_arch::x86::*, mem::transmute};
 #[target_feature(enable = "avx")]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_insert_epi64<const INDEX: i32>(a: __m256i, i: i64) -> __m256i {
+pub fn _mm256_insert_epi64<const INDEX: i32>(a: __m256i, i: i64) -> __m256i {
     static_assert_uimm_bits!(INDEX, 2);
-    transmute(simd_insert!(a.as_i64x4(), INDEX as u32, i))
+    unsafe { transmute(simd_insert!(a.as_i64x4(), INDEX as u32, i)) }
 }
 
 /// Extracts a 64-bit integer from `a`, selected with `INDEX`.
@@ -37,9 +37,9 @@ pub unsafe fn _mm256_insert_epi64<const INDEX: i32>(a: __m256i, i: i64) -> __m25
 #[rustc_legacy_const_generics(1)]
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_extract_epi64<const INDEX: i32>(a: __m256i) -> i64 {
+pub fn _mm256_extract_epi64<const INDEX: i32>(a: __m256i) -> i64 {
     static_assert_uimm_bits!(INDEX, 2);
-    simd_extract!(a.as_i64x4(), INDEX as u32)
+    unsafe { simd_extract!(a.as_i64x4(), INDEX as u32) }
 }
 
 #[cfg(test)]
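On x86_64 the 64-bit lane accessors take the lane index as a const generic, validated at compile time by `static_assert_uimm_bits!` (the `rustc_legacy_const_generics` attribute keeps the older positional-argument call form working). A closing sketch (hypothetical helper, same assumptions as the earlier sketches):

    use core::arch::x86_64::*;

    /// Increments lane 3 of a 4-lane i64 vector, leaving the rest intact.
    #[target_feature(enable = "avx")]
    fn bump_top_lane(a: __m256i) -> __m256i {
        let x = _mm256_extract_epi64::<3>(a);
        // A lane index outside 0..=3 would be rejected at compile time.
        _mm256_insert_epi64::<3>(a, x.wrapping_add(1))
    }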