x86: add unsafe to all x86 vendor intrinsics

Also, add missing assert_instr tests to each intrinsic, where possible.
2017-09-26 21:53:50 -04:00 · 2017-09-26 21:53:50 -04:00 · 6dfc65289c
commit 6dfc65289c
parent ff9e960628
12 changed files with 1611 additions and 1213 deletions
--- a/library/stdarch/examples/play.rs
+++ b/library/stdarch/examples/play.rs
@ -24,9 +24,11 @@ mod example {
        haystack.resize(16, 0);
        let vhaystack = vendor::__m128i::from(s::u8x16::load(&haystack, 0));

-        vendor::_mm_cmpestri(
-            vneedle, needle_len as i32, vhaystack, hay_len as i32,
-            vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
+        unsafe {
+            vendor::_mm_cmpestri(
+                vneedle, needle_len as i32, vhaystack, hay_len as i32,
+                vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
+        }
    }

    pub fn main() {
--- a/library/stdarch/src/x86/abm.rs
+++ b/library/stdarch/src/x86/abm.rs
@ -19,7 +19,7 @@ use stdsimd_test::assert_instr;
 #[inline(always)]
 #[target_feature = "+lzcnt"]
 #[cfg_attr(test, assert_instr(lzcnt))]
-pub fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
+pub unsafe fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }

 /// Counts the leading most significant zero bits.
 ///
@ -27,19 +27,19 @@ pub fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
 #[inline(always)]
 #[target_feature = "+lzcnt"]
 #[cfg_attr(test, assert_instr(lzcnt))]
-pub fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 }
+pub unsafe fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 }

 /// Counts the bits that are set.
 #[inline(always)]
 #[target_feature = "+popcnt"]
 #[cfg_attr(test, assert_instr(popcnt))]
-pub fn _popcnt32(x: u32) -> u32 { x.count_ones() }
+pub unsafe fn _popcnt32(x: u32) -> u32 { x.count_ones() }

 /// Counts the bits that are set.
 #[inline(always)]
 #[target_feature = "+popcnt"]
 #[cfg_attr(test, assert_instr(popcnt))]
-pub fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 }
+pub unsafe fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 }

 #[cfg(test)]
 mod tests {
@ -49,21 +49,21 @@ mod tests {

    #[simd_test = "lzcnt"]
    fn _lzcnt_u32() {
-        assert_eq!(abm::_lzcnt_u32(0b0101_1010u32), 25u32);
+        assert_eq!(unsafe { abm::_lzcnt_u32(0b0101_1010u32) }, 25u32);
    }

    #[simd_test = "lzcnt"]
    fn _lzcnt_u64() {
-        assert_eq!(abm::_lzcnt_u64(0b0101_1010u64), 57u64);
+        assert_eq!(unsafe { abm::_lzcnt_u64(0b0101_1010u64) }, 57u64);
    }

    #[simd_test = "popcnt"]
    fn _popcnt32() {
-        assert_eq!(abm::_popcnt32(0b0101_1010u32), 4);
+        assert_eq!(unsafe { abm::_popcnt32(0b0101_1010u32) }, 4);
    }

    #[simd_test = "popcnt"]
    fn _popcnt64() {
-        assert_eq!(abm::_popcnt64(0b0101_1010u64), 4);
+        assert_eq!(unsafe { abm::_popcnt64(0b0101_1010u64) }, 4);
    }
 }
--- a/library/stdarch/src/x86/avx.rs
+++ b/library/stdarch/src/x86/avx.rs
@ -1,14 +1,14 @@
-use v256::*;
-
 #[cfg(test)]
 use stdsimd_test::assert_instr;

+use v256::*;
+
 /// Add packed double-precision (64-bit) floating-point elements
 /// in `a` and `b`.
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vaddpd))]
-pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
+pub unsafe fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
    a + b
 }

@ -16,7 +16,7 @@ pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vaddps))]
-pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
+pub unsafe fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
    a + b
 }

@ -25,7 +25,7 @@ pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vmulpd))]
-pub fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
+pub unsafe fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
    a * b
 }

@ -33,7 +33,7 @@ pub fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vmulps))]
-pub fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
+pub unsafe fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
    a * b
 }

@ -42,8 +42,8 @@ pub fn _mm256_mul_ps(a: f32x8, b: f32x8) -> f32x8 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vaddsubpd))]
-pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
-    unsafe { addsubpd256(a, b) }
+pub unsafe fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
+    addsubpd256(a, b)
 }

 /// Alternatively add and subtract packed single-precision (32-bit)
@ -51,8 +51,8 @@ pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vaddsubps))]
-pub fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
-    unsafe { addsubps256(a, b) }
+pub unsafe fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
+    addsubps256(a, b)
 }

 /// Subtract packed double-precision (64-bit) floating-point elements in `b`
@ -60,7 +60,7 @@ pub fn _mm256_addsub_ps(a: f32x8, b: f32x8) -> f32x8 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vsubpd))]
-pub fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
+pub unsafe fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
    a - b
 }

@ -69,25 +69,24 @@ pub fn _mm256_sub_pd(a: f64x4, b: f64x4) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vsubps))]
-pub fn _mm256_sub_ps(a: f32x8, b: f32x8) -> f32x8 {
+pub unsafe fn _mm256_sub_ps(a: f32x8, b: f32x8) -> f32x8 {
    a - b
 }

 /// Round packed double-precision (64-bit) floating point elements in `a`
 /// according to the flag `b`. The value of `b` may be as follows:
+///
+/// ```ignore
 /// 0x00: Round to the nearest whole number.
 /// 0x01: Round down, toward negative infinity.
 /// 0x02: Round up, toward positive infinity.
 /// 0x03: Truncate the values.
-/// For a few additional values options, check the LLVM docs:
-/// https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
+/// ```
 #[inline(always)]
 #[target_feature = "+avx"]
-pub fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
+pub unsafe fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
    macro_rules! call {
-        ($imm8:expr) => {
-            unsafe { roundpd256(a, $imm8) }
-        }
+        ($imm8:expr) => { roundpd256(a, $imm8) }
    }
    constify_imm8!(b, call)
 }
@ -96,7 +95,7 @@ pub fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
 #[cfg_attr(test, assert_instr(vroundpd))]
 #[target_feature = "+avx"]
 fn test_mm256_round_pd(a: f64x4) -> f64x4 {
-    _mm256_round_pd(a, 0x3)
+    unsafe { _mm256_round_pd(a, 0x3) }
 }

 /// Round packed double-precision (64-bit) floating point elements in `a` toward
@ -104,8 +103,8 @@ fn test_mm256_round_pd(a: f64x4) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vroundpd))]
-pub fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
-    unsafe { roundpd256(a, 0x02) }
+pub unsafe fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
+    roundpd256(a, 0x02)
 }

 /// Round packed double-precision (64-bit) floating point elements in `a` toward
@ -113,8 +112,8 @@ pub fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vroundpd))]
-pub fn _mm256_floor_pd(a: f64x4) -> f64x4 {
-    unsafe { roundpd256(a, 0x01) }
+pub unsafe fn _mm256_floor_pd(a: f64x4) -> f64x4 {
+    roundpd256(a, 0x01)
 }

 /// LLVM intrinsics used in the above functions
@ -139,7 +138,7 @@ mod tests {
    fn _mm256_add_pd() {
        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = avx::_mm256_add_pd(a, b);
+        let r = unsafe { avx::_mm256_add_pd(a, b) };
        let e = f64x4::new(6.0, 8.0, 10.0, 12.0);
        assert_eq!(r, e);
    }
@ -148,7 +147,7 @@ mod tests {
    fn _mm256_add_ps() {
        let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
-        let r = avx::_mm256_add_ps(a, b);
+        let r = unsafe { avx::_mm256_add_ps(a, b) };
        let e = f32x8::new(10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0);
        assert_eq!(r, e);
    }
@ -157,7 +156,7 @@ mod tests {
    fn _mm256_mul_pd() {
        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = avx::_mm256_mul_pd(a, b);
+        let r = unsafe { avx::_mm256_mul_pd(a, b) };
        let e = f64x4::new(5.0, 12.0, 21.0, 32.0);
        assert_eq!(r, e);
    }
@ -166,7 +165,7 @@ mod tests {
    fn _mm256_mul_ps() {
        let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
-        let r = avx::_mm256_mul_ps(a, b);
+        let r = unsafe { avx::_mm256_mul_ps(a, b) };
        let e = f32x8::new(9.0, 20.0, 33.0, 48.0, 65.0, 84.0, 105.0, 128.0);
        assert_eq!(r, e);
    }
@ -175,7 +174,7 @@ mod tests {
    fn _mm256_addsub_pd() {
        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = avx::_mm256_addsub_pd(a, b);
+        let r = unsafe { avx::_mm256_addsub_pd(a, b) };
        let e = f64x4::new(-4.0, 8.0, -4.0, 12.0);
        assert_eq!(r, e);
    }
@ -184,7 +183,7 @@ mod tests {
    fn _mm256_addsub_ps() {
        let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0);
        let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0);
-        let r = avx::_mm256_addsub_ps(a, b);
+        let r = unsafe { avx::_mm256_addsub_ps(a, b) };
        let e = f32x8::new(-4.0, 8.0, -4.0, 12.0, -4.0, 8.0, -4.0, 12.0);
        assert_eq!(r, e);
    }
@ -193,7 +192,7 @@ mod tests {
    fn _mm256_sub_pd() {
        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = avx::_mm256_sub_pd(a, b);
+        let r = unsafe { avx::_mm256_sub_pd(a, b) };
        let e = f64x4::new(-4.0,-4.0,-4.0,-4.0);
        assert_eq!(r, e);
    }
@ -202,7 +201,7 @@ mod tests {
    fn _mm256_sub_ps() {
        let a = f32x8::new(1.0, 2.0, 3.0, 4.0, -1.0, -2.0, -3.0, -4.0);
        let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 3.0, 2.0, 1.0, 0.0);
-        let r = avx::_mm256_sub_ps(a, b);
+        let r = unsafe { avx::_mm256_sub_ps(a, b) };
        let e = f32x8::new(-4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -4.0, -4.0);
        assert_eq!(r, e);
    }
@ -210,9 +209,9 @@ mod tests {
    #[simd_test = "avx"]
    fn _mm256_round_pd() {
        let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
-        let result_closest = avx::_mm256_round_pd(a, 0b00000000);
-        let result_down = avx::_mm256_round_pd(a, 0b00000001);
-        let result_up = avx::_mm256_round_pd(a, 0b00000010);
+        let result_closest = unsafe { avx::_mm256_round_pd(a, 0b00000000) };
+        let result_down = unsafe { avx::_mm256_round_pd(a, 0b00000001) };
+        let result_up = unsafe { avx::_mm256_round_pd(a, 0b00000010) };
        let expected_closest = f64x4::new(2.0, 2.0, 4.0, -1.0);
        let expected_down = f64x4::new(1.0, 2.0, 3.0, -2.0);
        let expected_up = f64x4::new(2.0, 3.0, 4.0, -1.0);
@ -224,7 +223,7 @@ mod tests {
    #[simd_test = "avx"]
    fn _mm256_floor_pd() {
        let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
-        let result_down = avx::_mm256_floor_pd(a);
+        let result_down = unsafe { avx::_mm256_floor_pd(a) };
        let expected_down = f64x4::new(1.0, 2.0, 3.0, -2.0);
        assert_eq!(result_down, expected_down);
    }
@ -232,7 +231,7 @@ mod tests {
    #[simd_test = "avx"]
    fn _mm256_ceil_pd() {
        let a = f64x4::new(1.55, 2.2, 3.99, -1.2);
-        let result_up = avx::_mm256_ceil_pd(a, );
+        let result_up = unsafe { avx::_mm256_ceil_pd(a) };
        let expected_up = f64x4::new(2.0, 3.0, 4.0, -1.0);
        assert_eq!(result_up, expected_up);
    }
--- a/library/stdarch/src/x86/avx2.rs
+++ b/library/stdarch/src/x86/avx2.rs
--- a/library/stdarch/src/x86/bmi.rs
+++ b/library/stdarch/src/x86/bmi.rs
@ -10,20 +10,12 @@
 #[cfg(test)]
 use stdsimd_test::assert_instr;

-#[allow(dead_code)]
-extern "C" {
-    #[link_name="llvm.x86.bmi.bextr.32"]
-    fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
-    #[link_name="llvm.x86.bmi.bextr.64"]
-    fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
-}
-
 /// Extracts bits in range [`start`, `start` + `length`) from `a` into
 /// the least significant bits of the result.
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(bextr))]
-pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
+pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
    _bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32))
 }

@ -33,7 +25,7 @@ pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(bextr))]
 #[cfg(not(target_arch = "x86"))]
-pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
+pub unsafe fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
    _bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64))
 }

@ -45,8 +37,8 @@ pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(bextr))]
-pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
-    unsafe { x86_bmi_bextr_32(a, control) }
+pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 {
+    x86_bmi_bextr_32(a, control)
 }

 /// Extracts bits of `a` specified by `control` into
@ -58,15 +50,15 @@ pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(bextr))]
 #[cfg(not(target_arch = "x86"))]
-pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
-    unsafe { x86_bmi_bextr_64(a, control) }
+pub unsafe fn _bextr2_u64(a: u64, control: u64) -> u64 {
+    x86_bmi_bextr_64(a, control)
 }

 /// Bitwise logical `AND` of inverted `a` with `b`.
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(andn))]
-pub fn _andn_u32(a: u32, b: u32) -> u32 {
+pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 {
    !a & b
 }

@ -74,7 +66,7 @@ pub fn _andn_u32(a: u32, b: u32) -> u32 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(andn))]
-pub fn _andn_u64(a: u64, b: u64) -> u64 {
+pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 {
    !a & b
 }

@ -82,7 +74,7 @@ pub fn _andn_u64(a: u64, b: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(blsi))]
-pub fn _blsi_u32(x: u32) -> u32 {
+pub unsafe fn _blsi_u32(x: u32) -> u32 {
    x & x.wrapping_neg()
 }

@ -91,7 +83,7 @@ pub fn _blsi_u32(x: u32) -> u32 {
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(blsi))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blsi_u64(x: u64) -> u64 {
+pub unsafe fn _blsi_u64(x: u64) -> u64 {
    x & x.wrapping_neg()
 }

@ -99,7 +91,7 @@ pub fn _blsi_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(blsmsk))]
-pub fn _blsmsk_u32(x: u32) -> u32 {
+pub unsafe fn _blsmsk_u32(x: u32) -> u32 {
    x ^ (x.wrapping_sub(1u32))
 }

@ -108,7 +100,7 @@ pub fn _blsmsk_u32(x: u32) -> u32 {
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(blsmsk))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blsmsk_u64(x: u64) -> u64 {
+pub unsafe fn _blsmsk_u64(x: u64) -> u64 {
    x ^ (x.wrapping_sub(1u64))
 }

@ -118,7 +110,7 @@ pub fn _blsmsk_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(blsr))]
-pub fn _blsr_u32(x: u32) -> u32 {
+pub unsafe fn _blsr_u32(x: u32) -> u32 {
    x & (x.wrapping_sub(1))
 }

@ -129,7 +121,7 @@ pub fn _blsr_u32(x: u32) -> u32 {
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(blsr))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blsr_u64(x: u64) -> u64 {
+pub unsafe fn _blsr_u64(x: u64) -> u64 {
    x & (x.wrapping_sub(1))
 }

@ -139,7 +131,7 @@ pub fn _blsr_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(tzcnt))]
-pub fn _tzcnt_u16(x: u16) -> u16 {
+pub unsafe fn _tzcnt_u16(x: u16) -> u16 {
    x.trailing_zeros() as u16
 }

@ -149,7 +141,7 @@ pub fn _tzcnt_u16(x: u16) -> u16 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(tzcnt))]
-pub fn _tzcnt_u32(x: u32) -> u32 {
+pub unsafe fn _tzcnt_u32(x: u32) -> u32 {
    x.trailing_zeros()
 }

@ -159,7 +151,7 @@ pub fn _tzcnt_u32(x: u32) -> u32 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(tzcnt))]
-pub fn _tzcnt_u64(x: u64) -> u64 {
+pub unsafe fn _tzcnt_u64(x: u64) -> u64 {
    x.trailing_zeros() as u64
 }

@ -169,7 +161,7 @@ pub fn _tzcnt_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(tzcnt))]
-pub fn _mm_tzcnt_u32(x: u32) -> u32 {
+pub unsafe fn _mm_tzcnt_u32(x: u32) -> u32 {
    x.trailing_zeros()
 }

@ -179,10 +171,18 @@ pub fn _mm_tzcnt_u32(x: u32) -> u32 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(tzcnt))]
-pub fn _mm_tzcnt_u64(x: u64) -> u64 {
+pub unsafe fn _mm_tzcnt_u64(x: u64) -> u64 {
    x.trailing_zeros() as u64
 }

+#[allow(dead_code)]
+extern "C" {
+    #[link_name="llvm.x86.bmi.bextr.32"]
+    fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
+    #[link_name="llvm.x86.bmi.bextr.64"]
+    fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
+}
+
 #[cfg(test)]
 mod tests {
    use stdsimd_test::simd_test;
@ -191,98 +191,122 @@ mod tests {

    #[simd_test = "bmi"]
    fn _bextr_u32() {
-        assert_eq!(bmi::_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32);
+        let r = unsafe { bmi::_bextr_u32(0b0101_0000u32, 4, 4) };
+        assert_eq!(r, 0b0000_0101u32);
    }

    #[simd_test = "bmi"]
    #[cfg(not(target_arch = "x86"))]
    fn _bextr_u64() {
-        assert_eq!(bmi::_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64);
+        let r = unsafe { bmi::_bextr_u64(0b0101_0000u64, 4, 4) };
+        assert_eq!(r, 0b0000_0101u64);
    }

    #[simd_test = "bmi"]
    fn _andn_u32() {
-        assert_eq!(bmi::_andn_u32(0, 0), 0);
-        assert_eq!(bmi::_andn_u32(0, 1), 1);
-        assert_eq!(bmi::_andn_u32(1, 0), 0);
-        assert_eq!(bmi::_andn_u32(1, 1), 0);
+        assert_eq!(unsafe { bmi::_andn_u32(0, 0) }, 0);
+        assert_eq!(unsafe { bmi::_andn_u32(0, 1) }, 1);
+        assert_eq!(unsafe { bmi::_andn_u32(1, 0) }, 0);
+        assert_eq!(unsafe { bmi::_andn_u32(1, 1) }, 0);

-        assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32), 0b0000_0000u32);
-        assert_eq!(bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32), 0b1111_1111u32);
-        assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32), 0b0000_0000u32);
-        assert_eq!(bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32), 0b0000_0000u32);
-        assert_eq!(bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32), 0b0001_1101u32);
+        let r = unsafe { bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32) };
+        assert_eq!(r, 0b0000_0000u32);
+
+        let r = unsafe { bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32) };
+        assert_eq!(r, 0b1111_1111u32);
+
+        let r = unsafe { bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32) };
+        assert_eq!(r, 0b0000_0000u32);
+
+        let r = unsafe { bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32) };
+        assert_eq!(r, 0b0000_0000u32);
+
+        let r = unsafe { bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32) };
+        assert_eq!(r, 0b0001_1101u32);
    }

    #[simd_test = "bmi"]
    #[cfg(not(target_arch = "x86"))]
    fn _andn_u64() {
-        assert_eq!(bmi::_andn_u64(0, 0), 0);
-        assert_eq!(bmi::_andn_u64(0, 1), 1);
-        assert_eq!(bmi::_andn_u64(1, 0), 0);
-        assert_eq!(bmi::_andn_u64(1, 1), 0);
+        assert_eq!(unsafe { bmi::_andn_u64(0, 0) }, 0);
+        assert_eq!(unsafe { bmi::_andn_u64(0, 1) }, 1);
+        assert_eq!(unsafe { bmi::_andn_u64(1, 0) }, 0);
+        assert_eq!(unsafe { bmi::_andn_u64(1, 1) }, 0);

-        assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b0000_0000u64), 0b0000_0000u64);
-        assert_eq!(bmi::_andn_u64(0b0000_0000u64, 0b1111_1111u64), 0b1111_1111u64);
-        assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b0000_0000u64), 0b0000_0000u64);
-        assert_eq!(bmi::_andn_u64(0b1111_1111u64, 0b1111_1111u64), 0b0000_0000u64);
-        assert_eq!(bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64), 0b0001_1101u64);
+        let r = unsafe { bmi::_andn_u64(0b0000_0000u64, 0b0000_0000u64) };
+        assert_eq!(r, 0b0000_0000u64);
+
+        let r = unsafe { bmi::_andn_u64(0b0000_0000u64, 0b1111_1111u64) };
+        assert_eq!(r, 0b1111_1111u64);
+
+        let r = unsafe { bmi::_andn_u64(0b1111_1111u64, 0b0000_0000u64) };
+        assert_eq!(r, 0b0000_0000u64);
+
+        let r = unsafe { bmi::_andn_u64(0b1111_1111u64, 0b1111_1111u64) };
+        assert_eq!(r, 0b0000_0000u64);
+
+        let r = unsafe { bmi::_andn_u64(0b0100_0000u64, 0b0101_1101u64) };
+        assert_eq!(r, 0b0001_1101u64);
    }

    #[simd_test = "bmi"]
    fn _blsi_u32() {
-        assert_eq!(bmi::_blsi_u32(0b1101_0000u32), 0b0001_0000u32);
+        assert_eq!(unsafe { bmi::_blsi_u32(0b1101_0000u32) }, 0b0001_0000u32);
    }

    #[simd_test = "bmi"]
    #[cfg(not(target_arch = "x86"))]
    fn _blsi_u64() {
-        assert_eq!(bmi::_blsi_u64(0b1101_0000u64), 0b0001_0000u64);
+        assert_eq!(unsafe { bmi::_blsi_u64(0b1101_0000u64) }, 0b0001_0000u64);
    }

    #[simd_test = "bmi"]
    fn _blsmsk_u32() {
-        assert_eq!(bmi::_blsmsk_u32(0b0011_0000u32), 0b0001_1111u32);
+        let r = unsafe { bmi::_blsmsk_u32(0b0011_0000u32) };
+        assert_eq!(r, 0b0001_1111u32);
    }

    #[simd_test = "bmi"]
    #[cfg(not(target_arch = "x86"))]
    fn _blsmsk_u64() {
-        assert_eq!(bmi::_blsmsk_u64(0b0011_0000u64), 0b0001_1111u64);
+        let r = unsafe { bmi::_blsmsk_u64(0b0011_0000u64) };
+        assert_eq!(r, 0b0001_1111u64);
    }

    #[simd_test = "bmi"]
    fn _blsr_u32() {
-        /// TODO: test the behavior when the input is 0
-        assert_eq!(bmi::_blsr_u32(0b0011_0000u32), 0b0010_0000u32);
+        // TODO: test the behavior when the input is 0
+        let r = unsafe { bmi::_blsr_u32(0b0011_0000u32) };
+        assert_eq!(r, 0b0010_0000u32);
    }

    #[simd_test = "bmi"]
    #[cfg(not(target_arch = "x86"))]
    fn _blsr_u64() {
-        /// TODO: test the behavior when the input is 0
-        assert_eq!(bmi::_blsr_u64(0b0011_0000u64), 0b0010_0000u64);
+        // TODO: test the behavior when the input is 0
+        let r = unsafe { bmi::_blsr_u64(0b0011_0000u64) };
+        assert_eq!(r, 0b0010_0000u64);
    }

    #[simd_test = "bmi"]
    fn _tzcnt_u16() {
-        assert_eq!(bmi::_tzcnt_u16(0b0000_0001u16), 0u16);
-        assert_eq!(bmi::_tzcnt_u16(0b0000_0000u16), 16u16);
-        assert_eq!(bmi::_tzcnt_u16(0b1001_0000u16), 4u16);
+        assert_eq!(unsafe { bmi::_tzcnt_u16(0b0000_0001u16) }, 0u16);
+        assert_eq!(unsafe { bmi::_tzcnt_u16(0b0000_0000u16) }, 16u16);
+        assert_eq!(unsafe { bmi::_tzcnt_u16(0b1001_0000u16) }, 4u16);
    }

    #[simd_test = "bmi"]
    fn _tzcnt_u32() {
-        assert_eq!(bmi::_tzcnt_u32(0b0000_0001u32), 0u32);
-        assert_eq!(bmi::_tzcnt_u32(0b0000_0000u32), 32u32);
-        assert_eq!(bmi::_tzcnt_u32(0b1001_0000u32), 4u32);
+        assert_eq!(unsafe { bmi::_tzcnt_u32(0b0000_0001u32) }, 0u32);
+        assert_eq!(unsafe { bmi::_tzcnt_u32(0b0000_0000u32) }, 32u32);
+        assert_eq!(unsafe { bmi::_tzcnt_u32(0b1001_0000u32) }, 4u32);
    }

    #[simd_test = "bmi"]
    #[cfg(not(target_arch = "x86"))]
    fn _tzcnt_u64() {
-        assert_eq!(bmi::_tzcnt_u64(0b0000_0001u64), 0u64);
-        assert_eq!(bmi::_tzcnt_u64(0b0000_0000u64), 64u64);
-        assert_eq!(bmi::_tzcnt_u64(0b1001_0000u64), 4u64);
+        assert_eq!(unsafe { bmi::_tzcnt_u64(0b0000_0001u64) }, 0u64);
+        assert_eq!(unsafe { bmi::_tzcnt_u64(0b0000_0000u64) }, 64u64);
+        assert_eq!(unsafe { bmi::_tzcnt_u64(0b1001_0000u64) }, 4u64);
    }
 }
--- a/library/stdarch/src/x86/bmi2.rs
+++ b/library/stdarch/src/x86/bmi2.rs
@ -19,7 +19,7 @@ use stdsimd_test::assert_instr;
 #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))]
 #[cfg_attr(all(test, target_arch = "x86"), assert_instr(mulx))]
 #[target_feature = "+bmi2"]
-pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
+pub unsafe fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
    let result: u64 = (a as u64) * (b as u64);
    let hi = (result >> 32) as u32;
    (result as u32, hi)
@ -33,12 +33,67 @@ pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) {
 #[cfg_attr(test, assert_instr(mulx))]
 #[target_feature = "+bmi2"]
 #[cfg(not(target_arch = "x86"))] // calls an intrinsic
-pub fn _mulx_u64(a: u64, b: u64) -> (u64, u64) {
+pub unsafe fn _mulx_u64(a: u64, b: u64) -> (u64, u64) {
    let result: u128 = (a as u128) * (b as u128);
    let hi = (result >> 64) as u64;
    (result as u64, hi)
 }

+/// Zeroes the bits of `a` at positions >= `index`.
+#[inline(always)]
+#[target_feature = "+bmi2"]
+#[cfg_attr(test, assert_instr(bzhi))]
+pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 {
+    x86_bmi2_bzhi_32(a, index)
+}
+
+/// Zeroes the bits of `a` at positions >= `index`.
+#[inline(always)]
+#[target_feature = "+bmi2"]
+#[cfg_attr(test, assert_instr(bzhi))]
+#[cfg(not(target_arch = "x86"))]
+pub unsafe fn _bzhi_u64(a: u64, index: u64) -> u64 {
+    x86_bmi2_bzhi_64(a, index)
+}
+
+/// Scatter contiguous low order bits of `a` to the result at the positions
+/// specified by the `mask`.
+#[inline(always)]
+#[target_feature = "+bmi2"]
+#[cfg_attr(test, assert_instr(pdep))]
+pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 {
+    x86_bmi2_pdep_32(a, mask)
+}
+
+/// Scatter contiguous low order bits of `a` to the result at the positions
+/// specified by the `mask`.
+#[inline(always)]
+#[target_feature = "+bmi2"]
+#[cfg_attr(test, assert_instr(pdep))]
+#[cfg(not(target_arch = "x86"))]
+pub unsafe fn _pdep_u64(a: u64, mask: u64) -> u64 {
+    x86_bmi2_pdep_64(a, mask)
+}
+
+/// Gathers the bits of `a` specified by the `mask` into the contiguous low
+/// order bit positions of the result.
+#[inline(always)]
+#[target_feature = "+bmi2"]
+#[cfg_attr(test, assert_instr(pext))]
+pub unsafe fn _pext_u32(a: u32, mask: u32) -> u32 {
+    x86_bmi2_pext_32(a, mask)
+}
+
+/// Gathers the bits of `a` specified by the `mask` into the contiguous low
+/// order bit positions of the result.
+#[inline(always)]
+#[target_feature = "+bmi2"]
+#[cfg_attr(test, assert_instr(pext))]
+#[cfg(not(target_arch = "x86"))]
+pub unsafe fn _pext_u64(a: u64, mask: u64) -> u64 {
+    x86_bmi2_pext_64(a, mask)
+}
+
 #[allow(dead_code)]
 extern "C" {
    #[link_name="llvm.x86.bmi.bzhi.32"]
@ -55,63 +110,6 @@ extern "C" {
    fn x86_bmi2_pext_64(x: u64, y: u64) -> u64;
 }

-
-/// Zero higher bits of `a` >= `index`.
-#[inline(always)]
-#[target_feature = "+bmi2"]
-#[cfg_attr(test, assert_instr(bzhi))]
-pub fn _bzhi_u32(a: u32, index: u32) -> u32 {
-    unsafe { x86_bmi2_bzhi_32(a, index) }
-}
-
-/// Zero higher bits of `a` >= `index`.
-#[inline(always)]
-#[target_feature = "+bmi2"]
-#[cfg_attr(test, assert_instr(bzhi))]
-#[cfg(not(target_arch = "x86"))]
-pub fn _bzhi_u64(a: u64, index: u64) -> u64 {
-    unsafe { x86_bmi2_bzhi_64(a, index) }
-}
-
-
-/// Scatter contiguous low order bits of `a` to the result at the positions
-/// specified by the `mask`.
-#[inline(always)]
-#[target_feature = "+bmi2"]
-#[cfg_attr(test, assert_instr(pdep))]
-pub fn _pdep_u32(a: u32, mask: u32) -> u32 {
-    unsafe { x86_bmi2_pdep_32(a, mask) }
-}
-
-/// Scatter contiguous low order bits of `a` to the result at the positions
-/// specified by the `mask`.
-#[inline(always)]
-#[target_feature = "+bmi2"]
-#[cfg_attr(test, assert_instr(pdep))]
-#[cfg(not(target_arch = "x86"))]
-pub fn _pdep_u64(a: u64, mask: u64) -> u64 {
-    unsafe { x86_bmi2_pdep_64(a, mask) }
-}
-
-/// Gathers the bits of `x` specified by the `mask` into the contiguous low
-/// order bit positions of the result.
-#[inline(always)]
-#[target_feature = "+bmi2"]
-#[cfg_attr(test, assert_instr(pext))]
-pub fn _pext_u32(a: u32, mask: u32) -> u32 {
-    unsafe { x86_bmi2_pext_32(a, mask) }
-}
-
-/// Gathers the bits of `x` specified by the `mask` into the contiguous low
-/// order bit positions of the result.
-#[inline(always)]
-#[target_feature = "+bmi2"]
-#[cfg_attr(test, assert_instr(pext))]
-#[cfg(not(target_arch = "x86"))]
-pub fn _pext_u64(a: u64, mask: u64) -> u64 {
-    unsafe { x86_bmi2_pext_64(a, mask) }
-}
-
 #[cfg(test)]
 mod tests {
    use stdsimd_test::simd_test;
@ -128,8 +126,8 @@ mod tests {
        let m1 = 0b1110_1011_1110_1111u32;
        let s1 = 0b0001_0111_0100_0011u32;

-        assert_eq!(bmi2::_pext_u32(n, m0), s0);
-        assert_eq!(bmi2::_pext_u32(n, m1), s1);
+        assert_eq!(unsafe { bmi2::_pext_u32(n, m0) }, s0);
+        assert_eq!(unsafe { bmi2::_pext_u32(n, m1) }, s1);
    }

    #[simd_test = "bmi2"]
@ -143,8 +141,8 @@ mod tests {
        let m1 = 0b1110_1011_1110_1111u64;
        let s1 = 0b0001_0111_0100_0011u64;

-        assert_eq!(bmi2::_pext_u64(n, m0), s0);
-        assert_eq!(bmi2::_pext_u64(n, m1), s1);
+        assert_eq!(unsafe { bmi2::_pext_u64(n, m0) }, s0);
+        assert_eq!(unsafe { bmi2::_pext_u64(n, m1) }, s1);
    }

    #[simd_test = "bmi2"]
@ -157,8 +155,8 @@ mod tests {
        let m1 = 0b1110_1011_1110_1111u32;
        let s1 = 0b1110_1001_0010_0011u32;

-        assert_eq!(bmi2::_pdep_u32(n, m0), s0);
-        assert_eq!(bmi2::_pdep_u32(n, m1), s1);
+        assert_eq!(unsafe { bmi2::_pdep_u32(n, m0) }, s0);
+        assert_eq!(unsafe { bmi2::_pdep_u32(n, m1) }, s1);
    }

    #[simd_test = "bmi2"]
@ -172,15 +170,15 @@ mod tests {
        let m1 = 0b1110_1011_1110_1111u64;
        let s1 = 0b1110_1001_0010_0011u64;

-        assert_eq!(bmi2::_pdep_u64(n, m0), s0);
-        assert_eq!(bmi2::_pdep_u64(n, m1), s1);
+        assert_eq!(unsafe { bmi2::_pdep_u64(n, m0) }, s0);
+        assert_eq!(unsafe { bmi2::_pdep_u64(n, m1) }, s1);
    }

    #[simd_test = "bmi2"]
    fn _bzhi_u32() {
        let n = 0b1111_0010u32;
        let s = 0b0001_0010u32;
-        assert_eq!(bmi2::_bzhi_u32(n, 5), s);
+        assert_eq!(unsafe { bmi2::_bzhi_u32(n, 5) }, s);
    }

    #[simd_test = "bmi2"]
@ -188,14 +186,14 @@ mod tests {
    fn _bzhi_u64() {
        let n = 0b1111_0010u64;
        let s = 0b0001_0010u64;
-        assert_eq!(bmi2::_bzhi_u64(n, 5), s);
+        assert_eq!(unsafe { bmi2::_bzhi_u64(n, 5) }, s);
    }

    #[simd_test = "bmi2"]
    fn _mulx_u32() {
        let a: u32 = 4_294_967_200;
        let b: u32 = 2;
-        let (lo, hi): (u32, u32)  = bmi2::_mulx_u32(a, b);
+        let (lo, hi): (u32, u32) = unsafe { bmi2::_mulx_u32(a, b) };
        // result = 8589934400
        //        = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64
        //            ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -208,7 +206,7 @@ mod tests {
    fn _mulx_u64() {
        let a: u64 = 9_223_372_036_854_775_800;
        let b: u64 = 100;
-        let (lo, hi): (u64, u64)  = bmi2::_mulx_u64(a, b);
+        let (lo, hi): (u64, u64) = unsafe { bmi2::_mulx_u64(a, b) };
        // result = 922337203685477580000
        //        = 0b00110001_11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u128
        //            ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/library/stdarch/src/x86/sse.rs
+++ b/library/stdarch/src/x86/sse.rs
@ -9,15 +9,15 @@ use stdsimd_test::assert_instr;
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(addss))]
-pub fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { addss(a, b) }
+pub unsafe fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 {
+    addss(a, b)
 }

 /// Adds f32x4 vectors.
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(addps))]
-pub fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
+pub unsafe fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
    a + b
 }

@ -26,15 +26,15 @@ pub fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(subss))]
-pub fn _mm_sub_ss(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { subss(a, b) }
+pub unsafe fn _mm_sub_ss(a: f32x4, b: f32x4) -> f32x4 {
+    subss(a, b)
 }

 /// Subtracts f32x4 vectors.
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(subps))]
-pub fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
+pub unsafe fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
    a - b
 }

@ -43,15 +43,15 @@ pub fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(mulss))]
-pub fn _mm_mul_ss(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { mulss(a, b) }
+pub unsafe fn _mm_mul_ss(a: f32x4, b: f32x4) -> f32x4 {
+    mulss(a, b)
 }

 /// Multiplies f32x4 vectors.
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(mulps))]
-pub fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
+pub unsafe fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
    a * b
 }

@ -60,15 +60,15 @@ pub fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(divss))]
-pub fn _mm_div_ss(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { divss(a, b) }
+pub unsafe fn _mm_div_ss(a: f32x4, b: f32x4) -> f32x4 {
+    divss(a, b)
 }

 /// Divides f32x4 vectors.
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(divps))]
-pub fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
+pub unsafe fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
    a / b
 }

@ -77,8 +77,8 @@ pub fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(sqrtss))]
-pub fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
-    unsafe { sqrtss(a) }
+pub unsafe fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
+    sqrtss(a)
 }

 /// Return the square root of packed single-precision (32-bit) floating-point
@ -86,8 +86,8 @@ pub fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(sqrtps))]
-pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
-    unsafe { sqrtps(a) }
+pub unsafe fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
+    sqrtps(a)
 }

 /// Return the approximate reciprocal of the first single-precision
@ -95,8 +95,8 @@ pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(rcpss))]
-pub fn _mm_rcp_ss(a: f32x4) -> f32x4 {
-    unsafe { rcpss(a) }
+pub unsafe fn _mm_rcp_ss(a: f32x4) -> f32x4 {
+    rcpss(a)
 }

 /// Return the approximate reciprocal of packed single-precision (32-bit)
@ -104,8 +104,8 @@ pub fn _mm_rcp_ss(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(rcpps))]
-pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
-    unsafe { rcpps(a) }
+pub unsafe fn _mm_rcp_ps(a: f32x4) -> f32x4 {
+    rcpps(a)
 }

 /// Return the approximate reciprocal square root of the first single-precision
@ -113,8 +113,8 @@ pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(rsqrtss))]
-pub fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
-    unsafe { rsqrtss(a) }
+pub unsafe fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
+    rsqrtss(a)
 }

 /// Return the approximate reciprocal square root of packed single-precision
@ -122,8 +122,8 @@ pub fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(rsqrtps))]
-pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
-    unsafe { rsqrtps(a) }
+pub unsafe fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
+    rsqrtps(a)
 }

 /// Compare the first single-precision (32-bit) floating-point element of `a`
@ -132,8 +132,8 @@ pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(minss))]
-pub fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { minss(a, b) }
+pub unsafe fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
+    minss(a, b)
 }

 /// Compare packed single-precision (32-bit) floating-point elements in `a` and
@ -141,8 +141,8 @@ pub fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(minps))]
-pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { minps(a, b) }
+pub unsafe fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
+    minps(a, b)
 }

 /// Compare the first single-precision (32-bit) floating-point element of `a`
@ -151,8 +151,8 @@ pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(maxss))]
-pub fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { maxss(a, b) }
+pub unsafe fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
+    maxss(a, b)
 }

 /// Compare packed single-precision (32-bit) floating-point elements in `a` and
@ -160,24 +160,23 @@ pub fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(maxps))]
-pub fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { maxps(a, b) }
+pub unsafe fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
+    maxps(a, b)
 }

-// Shuffle packed single-precision (32-bit) floating-point elements in `a` and `b`
-// using `mask`.
-// The lower half of result takes values from `a` and the higher half from `b`.
-// Mask is split to 2 control bits each to index the element from inputs.
+/// Shuffle packed single-precision (32-bit) floating-point elements in `a` and
+/// `b` using `mask`.
+///
+/// The lower half of result takes values from `a` and the higher half from
+/// `b`. Mask is split to 2 control bits each to index the element from inputs.
 #[inline(always)]
 #[target_feature = "+sse"]
-pub fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
+pub unsafe fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
    let mask = (mask & 0xFF) as u8;

    macro_rules! shuffle_done {
        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
-            unsafe {
-                simd_shuffle4(a, b, [$x01, $x23, $x45, $x67])
-            }
+            simd_shuffle4(a, b, [$x01, $x23, $x45, $x67])
        }
    }
    macro_rules! shuffle_x67 {
@ -219,10 +218,10 @@ pub fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
 }

 #[cfg(test)]
-#[cfg_attr(test, assert_instr(shufps))]
 #[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(shufps))]
 fn _test_mm_shuffle_ps(a: f32x4, b: f32x4) -> f32x4 {
-    _mm_shuffle_ps(a, b, 3)
+    unsafe { _mm_shuffle_ps(a, b, 3) }
 }

 /// Unpack and interleave single-precision (32-bit) floating-point elements
@ -230,8 +229,8 @@ fn _test_mm_shuffle_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(unpckhps))]
-pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { simd_shuffle4(a, b, [2, 6, 3, 7]) }
+pub unsafe fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
+    simd_shuffle4(a, b, [2, 6, 3, 7])
 }

 /// Unpack and interleave single-precision (32-bit) floating-point elements
@ -239,8 +238,8 @@ pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(unpcklps))]
-pub fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { simd_shuffle4(a, b, [0, 4, 1, 5]) }
+pub unsafe fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
+    simd_shuffle4(a, b, [0, 4, 1, 5])
 }

 /// Combine higher half of `a` and `b`. The higher half of `b` occupies the lower
@ -249,9 +248,9 @@ pub fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[target_feature = "+sse"]
 #[cfg_attr(all(test, not(windows)), assert_instr(movhlps))]
 #[cfg_attr(all(test, windows), assert_instr(unpckhpd))]
-pub fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
+pub unsafe fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
    // TODO: figure out why this is a different instruction on Windows.
-    unsafe { simd_shuffle4(a, b, [6, 7, 2, 3]) }
+    simd_shuffle4(a, b, [6, 7, 2, 3])
 }

 /// Combine lower half of `a` and `b`. The lower half of `b` occupies the higher
@ -259,8 +258,8 @@ pub fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(unpcklpd))]
-pub fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
-    unsafe { simd_shuffle4(a, b, [0, 1, 4, 5]) }
+pub unsafe fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
+    simd_shuffle4(a, b, [0, 1, 4, 5])
 }

 /// Return a mask of the most significant bit of each element in `a`.
@ -270,8 +269,8 @@ pub fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(movmskps))]
-pub fn _mm_movemask_ps(a: f32x4) -> i32 {
-    unsafe { movmskps(a) }
+pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
+    movmskps(a)
 }

 #[allow(improper_ctypes)]
@ -318,7 +317,7 @@ mod tests {
    fn _mm_add_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_add_ps(a, b);
+        let r = unsafe { sse::_mm_add_ps(a, b) };
        assert_eq!(r, f32x4::new(-101.0, 25.0, 0.0, -15.0));
    }

@ -326,7 +325,7 @@ mod tests {
    fn _mm_add_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_add_ss(a, b);
+        let r = unsafe { sse::_mm_add_ss(a, b) };
        assert_eq!(r, f32x4::new(-101.0, 5.0, 0.0, -10.0));
    }

@ -334,7 +333,7 @@ mod tests {
    fn _mm_sub_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_sub_ps(a, b);
+        let r = unsafe { sse::_mm_sub_ps(a, b) };
        assert_eq!(r, f32x4::new(99.0, -15.0, 0.0, -5.0));
    }

@ -342,7 +341,7 @@ mod tests {
    fn _mm_sub_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_sub_ss(a, b);
+        let r = unsafe { sse::_mm_sub_ss(a, b) };
        assert_eq!(r, f32x4::new(99.0, 5.0, 0.0, -10.0));
    }

@ -350,7 +349,7 @@ mod tests {
    fn _mm_mul_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_mul_ps(a, b);
+        let r = unsafe { sse::_mm_mul_ps(a, b) };
        assert_eq!(r, f32x4::new(100.0, 100.0, 0.0, 50.0));
    }

@ -358,7 +357,7 @@ mod tests {
    fn _mm_mul_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_mul_ss(a, b);
+        let r = unsafe { sse::_mm_mul_ss(a, b) };
        assert_eq!(r, f32x4::new(100.0, 5.0, 0.0, -10.0));
    }

@ -366,7 +365,7 @@ mod tests {
    fn _mm_div_ps() {
        let a = f32x4::new(-1.0, 5.0, 2.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.2, -5.0);
-        let r = sse::_mm_div_ps(a, b);
+        let r = unsafe { sse::_mm_div_ps(a, b) };
        assert_eq!(r, f32x4::new(0.01, 0.25, 10.0, 2.0));
    }

@ -374,14 +373,14 @@ mod tests {
    fn _mm_div_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_div_ss(a, b);
+        let r = unsafe { sse::_mm_div_ss(a, b) };
        assert_eq!(r, f32x4::new(0.01, 5.0, 0.0, -10.0));
    }

    #[simd_test = "sse"]
    fn _mm_sqrt_ss() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
-        let r = sse::_mm_sqrt_ss(a);
+        let r = unsafe { sse::_mm_sqrt_ss(a) };
        let e = f32x4::new(2.0, 13.0, 16.0, 100.0);
        assert_eq!(r, e);
    }
@ -389,7 +388,7 @@ mod tests {
    #[simd_test = "sse"]
    fn _mm_sqrt_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
-        let r = sse::_mm_sqrt_ps(a);
+        let r = unsafe { sse::_mm_sqrt_ps(a) };
        let e = f32x4::new(2.0, 3.6055512, 4.0, 10.0);
        assert_eq!(r, e);
    }
@ -397,7 +396,7 @@ mod tests {
    #[simd_test = "sse"]
    fn _mm_rcp_ss() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
-        let r = sse::_mm_rcp_ss(a);
+        let r = unsafe { sse::_mm_rcp_ss(a) };
        let e = f32x4::new(0.24993896, 13.0, 16.0, 100.0);
        assert_eq!(r, e);
    }
@ -405,7 +404,7 @@ mod tests {
    #[simd_test = "sse"]
    fn _mm_rcp_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
-        let r = sse::_mm_rcp_ps(a);
+        let r = unsafe { sse::_mm_rcp_ps(a) };
        let e = f32x4::new(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
        assert_eq!(r, e);
    }
@ -413,7 +412,7 @@ mod tests {
    #[simd_test = "sse"]
    fn _mm_rsqrt_ss() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
-        let r = sse::_mm_rsqrt_ss(a);
+        let r = unsafe { sse::_mm_rsqrt_ss(a) };
        let e = f32x4::new(0.49987793, 13.0, 16.0, 100.0);
        assert_eq!(r, e);
    }
@ -421,7 +420,7 @@ mod tests {
    #[simd_test = "sse"]
    fn _mm_rsqrt_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
-        let r = sse::_mm_rsqrt_ps(a);
+        let r = unsafe { sse::_mm_rsqrt_ps(a) };
        let e = f32x4::new(0.49987793, 0.2772827, 0.24993896, 0.099990845);
        assert_eq!(r, e);
    }
@ -430,7 +429,7 @@ mod tests {
    fn _mm_min_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_min_ss(a, b);
+        let r = unsafe { sse::_mm_min_ss(a, b) };
        assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
    }

@ -438,7 +437,7 @@ mod tests {
    fn _mm_min_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_min_ps(a, b);
+        let r = unsafe { sse::_mm_min_ps(a, b) };
        assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
    }

@ -446,7 +445,7 @@ mod tests {
    fn _mm_max_ss() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_max_ss(a, b);
+        let r = unsafe { sse::_mm_max_ss(a, b) };
        assert_eq!(r, f32x4::new(-1.0, 5.0, 0.0, -10.0));
    }

@ -454,7 +453,7 @@ mod tests {
    fn _mm_max_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_max_ps(a, b);
+        let r = unsafe { sse::_mm_max_ps(a, b) };
        assert_eq!(r, f32x4::new(-1.0, 20.0, 0.0, -5.0));
    }

@ -463,7 +462,7 @@ mod tests {
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
        let mask = 0b00_01_01_11;
-        let r = sse::_mm_shuffle_ps(a, b, mask);
+        let r = unsafe { sse::_mm_shuffle_ps(a, b, mask) };
        assert_eq!(r, f32x4::new(4.0, 2.0, 6.0, 5.0));
    }

@ -471,7 +470,7 @@ mod tests {
    fn _mm_unpackhi_ps() {
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = sse::_mm_unpackhi_ps(a, b);
+        let r = unsafe { sse::_mm_unpackhi_ps(a, b) };
        assert_eq!(r, f32x4::new(3.0, 7.0, 4.0, 8.0));
    }

@ -479,7 +478,7 @@ mod tests {
    fn _mm_unpacklo_ps() {
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = sse::_mm_unpacklo_ps(a, b);
+        let r = unsafe { sse::_mm_unpacklo_ps(a, b) };
        assert_eq!(r, f32x4::new(1.0, 5.0, 2.0, 6.0));
    }

@ -487,7 +486,7 @@ mod tests {
    fn _mm_movehl_ps() {
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = sse::_mm_movehl_ps(a, b);
+        let r = unsafe { sse::_mm_movehl_ps(a, b) };
        assert_eq!(r, f32x4::new(7.0, 8.0, 3.0, 4.0));
    }

@ -495,16 +494,20 @@ mod tests {
    fn _mm_movelh_ps() {
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
-        let r = sse::_mm_movelh_ps(a, b);
+        let r = unsafe { sse::_mm_movelh_ps(a, b) };
        assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0));
    }

    #[simd_test = "sse"]
    fn _mm_movemask_ps() {
-        let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0));
+        let r = unsafe {
+            sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0))
+        };
        assert_eq!(r, 0b0101);

-        let r = sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0));
+        let r = unsafe {
+            sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0))
+        };
        assert_eq!(r, 0b0111);
    }
 }
--- a/library/stdarch/src/x86/sse2.rs
+++ b/library/stdarch/src/x86/sse2.rs
--- a/library/stdarch/src/x86/sse41.rs
+++ b/library/stdarch/src/x86/sse41.rs
@ -1,18 +1,18 @@
-use v128::*;
-use x86::__m128i;
-
 #[cfg(test)]
 use stdsimd_test::assert_instr;

+use v128::*;
+use x86::__m128i;
+
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pblendvb))]
-pub fn _mm_blendv_epi8(
+pub unsafe fn _mm_blendv_epi8(
    a: __m128i,
    b: __m128i,
    mask: __m128i,
 ) -> __m128i {
-    unsafe { pblendvb(a, b, mask) }
+    pblendvb(a, b, mask)
 }

 /// Returns the dot product of two f64x2 vectors.
@ -24,15 +24,20 @@ pub fn _mm_blendv_epi8(
 /// the broadcast mask bit is zero then the return component will be zero.
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-pub fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
+pub unsafe fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
    macro_rules! call {
-        ($imm8:expr) => {
-            unsafe { dppd(a, b, $imm8) }
-        }
+        ($imm8:expr) => { dppd(a, b, $imm8) }
    }
    constify_imm8!(imm8, call)
 }

+#[cfg(test)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(dppd))]
+fn _test_mm_dp_pd(a: f64x2, b: f64x2) -> f64x2 {
+    unsafe { _mm_dp_pd(a, b, 0) }
+}
+
 /// Returns the dot product of two f32x4 vectors.
 ///
 /// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask.
@ -42,15 +47,20 @@ pub fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
 /// the broadcast mask bit is zero then the return component will be zero.
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-pub fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
+pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
    macro_rules! call {
-        ($imm8:expr) => {
-            unsafe { dpps(a, b, $imm8) }
-        }
+        ($imm8:expr) => { dpps(a, b, $imm8) }
    }
    constify_imm8!(imm8, call)
 }

+#[cfg(test)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(dpps))]
+fn _test_mm_dp_ps(a: f32x4, b: f32x4) -> f32x4 {
+    unsafe { _mm_dp_ps(a, b, 0) }
+}
+
 #[allow(improper_ctypes)]
 extern {
    #[link_name = "llvm.x86.sse41.pblendvb"]
@ -78,7 +88,7 @@ mod tests {
            0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
        let e = i8x16::new(
            0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31);
-        assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e);
+        assert_eq!(unsafe { sse41::_mm_blendv_epi8(a, b, mask) }, e);
    }

    #[simd_test = "sse4.1"]
@ -86,7 +96,7 @@ mod tests {
        let a = f64x2::new(2.0, 3.0);
        let b = f64x2::new(1.0, 4.0);
        let e = f64x2::new(14.0, 0.0);
-        assert_eq!(sse41::_mm_dp_pd(a, b, 0b00110001), e);
+        assert_eq!(unsafe { sse41::_mm_dp_pd(a, b, 0b00110001) }, e);
    }

    #[simd_test = "sse4.1"]
@ -94,6 +104,6 @@ mod tests {
        let a = f32x4::new(2.0, 3.0, 1.0, 10.0);
        let b = f32x4::new(1.0, 4.0, 0.5, 10.0);
        let e = f32x4::new(14.5, 0.0, 14.5, 0.0);
-        assert_eq!(sse41::_mm_dp_ps(a, b, 0b01110101), e);
+        assert_eq!(unsafe { sse41::_mm_dp_ps(a, b, 0b01110101) }, e);
    }
 }
--- a/library/stdarch/src/x86/sse42.rs
+++ b/library/stdarch/src/x86/sse42.rs
@ -1,3 +1,6 @@
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
 use x86::__m128i;

 pub const _SIDD_UBYTE_OPS: i8 = 0b00000000;
@ -19,7 +22,7 @@ pub const _SIDD_MOST_SIGNIFICANT: i8 = 0b01000000;

 #[inline(always)]
 #[target_feature = "+sse4.2"]
-pub fn _mm_cmpestri(
+pub unsafe fn _mm_cmpestri(
    a: __m128i,
    la: i32,
    b: __m128i,
@ -27,13 +30,18 @@ pub fn _mm_cmpestri(
    imm8: i8,
 ) -> i32 {
    macro_rules! call {
-        ($imm8:expr) => {
-            unsafe { pcmpestri128(a, la, b, lb, $imm8) }
-        }
+        ($imm8:expr) => { pcmpestri128(a, la, b, lb, $imm8) }
    }
    constify_imm8!(imm8, call)
 }

+#[cfg(test)]
+#[target_feature = "+sse4.2"]
+#[cfg_attr(test, assert_instr(pcmpestri))]
+fn _test_mm_cmpestri(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32 {
+    unsafe { _mm_cmpestri(a, la, b, lb, 0) }
+}
+
 #[allow(improper_ctypes)]
 extern {
    #[link_name = "llvm.x86.sse42.pcmpestri128"]
@ -53,8 +61,10 @@ mod tests {
        let b = &b"foobar          "[..];
        let va = __m128i::from(u8x16::load(a, 0));
        let vb = __m128i::from(u8x16::load(b, 0));
-        let i = sse42::_mm_cmpestri(
-            va, 3, vb, 6, sse42::_SIDD_CMP_EQUAL_ORDERED);
+        let i = unsafe {
+            sse42::_mm_cmpestri(
+                va, 3, vb, 6, sse42::_SIDD_CMP_EQUAL_ORDERED)
+        };
        assert_eq!(3, i);
    }
 }
--- a/library/stdarch/src/x86/ssse3.rs
+++ b/library/stdarch/src/x86/ssse3.rs
@ -1,15 +1,15 @@
-use v128::*;
-
 #[cfg(test)]
 use stdsimd_test::assert_instr;

+use v128::*;
+
 /// Compute the absolute value of packed 8-bit signed integers in `a` and
 /// return the unsigned results.
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(pabsb))]
-pub fn _mm_abs_epi8(a: i8x16) -> u8x16 {
-    unsafe { pabsb128(a) }
+pub unsafe fn _mm_abs_epi8(a: i8x16) -> u8x16 {
+    pabsb128(a)
 }

 /// Shuffle bytes from `a` according to the content of `b`.
@ -39,8 +39,8 @@ pub fn _mm_abs_epi8(a: i8x16) -> u8x16 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(pshufb))]
-pub fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
-    unsafe { pshufb128(a, b) }
+pub unsafe fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
+    pshufb128(a, b)
 }


@ -48,7 +48,6 @@ pub fn _mm_shuffle_epi8(a: u8x16, b: u8x16) -> u8x16 {
 extern {
    #[link_name = "llvm.x86.ssse3.pabs.b.128"]
    fn pabsb128(a: i8x16) -> u8x16;
-
    #[link_name = "llvm.x86.ssse3.pshuf.b.128"]
    fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
 }
@ -62,16 +61,31 @@ mod tests {

    #[simd_test = "ssse3"]
    fn _mm_abs_epi8() {
-        let r = ssse3::_mm_abs_epi8(i8x16::splat(-5));
+        let r = unsafe { ssse3::_mm_abs_epi8(i8x16::splat(-5)) };
        assert_eq!(r, u8x16::splat(5));
    }

    #[simd_test = "ssse3"]
    fn _mm_shuffle_epi8() {
-        let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let b = u8x16::new(4, 128, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
-        let expected = u8x16::new(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
-        let r = ssse3::_mm_shuffle_epi8(a, b);
+        let a = u8x16::new(
+            1, 2, 3, 4,
+            5, 6, 7, 8,
+            9, 10, 11, 12,
+            13, 14, 15, 16,
+        );
+        let b = u8x16::new(
+            4, 128, 4, 3,
+            24, 12, 6, 19,
+            12, 5, 5, 10,
+            4, 1, 8, 0,
+        );
+        let expected = u8x16::new(
+            5, 0, 5, 4,
+            9, 13, 7, 4,
+            13, 6, 6, 11,
+            5, 2, 9, 1,
+        );
+        let r = unsafe { ssse3::_mm_shuffle_epi8(a, b) };
        assert_eq!(r, expected);
    }
 }
--- a/library/stdarch/src/x86/tbm.rs
+++ b/library/stdarch/src/x86/tbm.rs
@ -65,7 +65,7 @@ pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcfill))]
-pub fn _blcfill_u32(x: u32) -> u32 {
+pub unsafe fn _blcfill_u32(x: u32) -> u32 {
    x & (x.wrapping_add(1))
 }

@ -76,7 +76,7 @@ pub fn _blcfill_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcfill))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blcfill_u64(x: u64) -> u64 {
+pub unsafe fn _blcfill_u64(x: u64) -> u64 {
    x & (x.wrapping_add(1))
 }

@ -86,7 +86,7 @@ pub fn _blcfill_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blci))]
-pub fn _blci_u32(x: u32) -> u32 {
+pub unsafe fn _blci_u32(x: u32) -> u32 {
    x | !(x.wrapping_add(1))
 }

@ -97,7 +97,7 @@ pub fn _blci_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blci))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blci_u64(x: u64) -> u64 {
+pub unsafe fn _blci_u64(x: u64) -> u64 {
    x | !(x.wrapping_add(1))
 }

@ -107,7 +107,7 @@ pub fn _blci_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcic))]
-pub fn _blcic_u32(x: u32) -> u32 {
+pub unsafe fn _blcic_u32(x: u32) -> u32 {
    !x & (x.wrapping_add(1))
 }

@ -118,7 +118,7 @@ pub fn _blcic_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcic))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blcic_u64(x: u64) -> u64 {
+pub unsafe fn _blcic_u64(x: u64) -> u64 {
    !x & (x.wrapping_add(1))
 }

@ -128,7 +128,7 @@ pub fn _blcic_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcmsk))]
-pub fn _blcmsk_u32(x: u32) -> u32 {
+pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
    x ^ (x.wrapping_add(1))
 }

@ -139,7 +139,7 @@ pub fn _blcmsk_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcmsk))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blcmsk_u64(x: u64) -> u64 {
+pub unsafe fn _blcmsk_u64(x: u64) -> u64 {
    x ^ (x.wrapping_add(1))
 }

@ -149,7 +149,7 @@ pub fn _blcmsk_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcs))]
-pub fn _blcs_u32(x: u32) -> u32 {
+pub unsafe fn _blcs_u32(x: u32) -> u32 {
    x | (x.wrapping_add(1))
 }

@ -160,7 +160,7 @@ pub fn _blcs_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blcs))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blcs_u64(x: u64) -> u64 {
+pub unsafe fn _blcs_u64(x: u64) -> u64 {
    x | x.wrapping_add(1)
 }

@ -170,7 +170,7 @@ pub fn _blcs_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blsfill))]
-pub fn _blsfill_u32(x: u32) -> u32 {
+pub unsafe fn _blsfill_u32(x: u32) -> u32 {
    x | (x.wrapping_sub(1))
 }

@ -181,7 +181,7 @@ pub fn _blsfill_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blsfill))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blsfill_u64(x: u64) -> u64 {
+pub unsafe fn _blsfill_u64(x: u64) -> u64 {
    x | (x.wrapping_sub(1))
 }

@ -191,7 +191,7 @@ pub fn _blsfill_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blsic))]
-pub fn _blsic_u32(x: u32) -> u32 {
+pub unsafe fn _blsic_u32(x: u32) -> u32 {
    !x | (x.wrapping_sub(1))
 }

@ -202,7 +202,7 @@ pub fn _blsic_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(blsic))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _blsic_u64(x: u64) -> u64 {
+pub unsafe fn _blsic_u64(x: u64) -> u64 {
    !x | (x.wrapping_sub(1))
 }

@ -213,7 +213,7 @@ pub fn _blsic_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(t1mskc))]
-pub fn _t1mskc_u32(x: u32) -> u32 {
+pub unsafe fn _t1mskc_u32(x: u32) -> u32 {
    !x | (x.wrapping_add(1))
 }

@ -225,7 +225,7 @@ pub fn _t1mskc_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(t1mskc))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _t1mskc_u64(x: u64) -> u64 {
+pub unsafe fn _t1mskc_u64(x: u64) -> u64 {
    !x | (x.wrapping_add(1))
 }

@ -236,7 +236,7 @@ pub fn _t1mskc_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(tzmsk))]
-pub fn _tzmsk_u32(x: u32) -> u32 {
+pub unsafe fn _tzmsk_u32(x: u32) -> u32 {
    !x & (x.wrapping_sub(1))
 }

@ -248,7 +248,7 @@ pub fn _tzmsk_u32(x: u32) -> u32 {
 #[target_feature = "+tbm"]
 #[cfg_attr(test, assert_instr(tzmsk))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
-pub fn _tzmsk_u64(x: u64) -> u64 {
+pub unsafe fn _tzmsk_u64(x: u64) -> u64 {
    !x & (x.wrapping_sub(1))
 }

@ -272,122 +272,174 @@ mod tests {

    #[simd_test = "tbm"]
    fn _blcfill_u32() {
-        assert_eq!(tbm::_blcfill_u32(0b0101_0111u32), 0b0101_0000u32);
-        assert_eq!(tbm::_blcfill_u32(0b1111_1111u32), 0u32);
+        assert_eq!(
+            unsafe { tbm::_blcfill_u32(0b0101_0111u32) },
+            0b0101_0000u32);
+        assert_eq!(
+            unsafe { tbm::_blcfill_u32(0b1111_1111u32) },
+            0u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _blcfill_u64() {
-        assert_eq!(tbm::_blcfill_u64(0b0101_0111u64), 0b0101_0000u64);
-        assert_eq!(tbm::_blcfill_u64(0b1111_1111u64), 0u64);
+        assert_eq!(
+            unsafe { tbm::_blcfill_u64(0b0101_0111u64) },
+            0b0101_0000u64);
+        assert_eq!(
+            unsafe { tbm::_blcfill_u64(0b1111_1111u64) },
+            0u64);
    }

    #[simd_test = "tbm"]
    fn _blci_u32() {
-        assert_eq!(tbm::_blci_u32(0b0101_0000u32),
-                   0b1111_1111_1111_1111_1111_1111_1111_1110u32);
-        assert_eq!(tbm::_blci_u32(0b1111_1111u32),
-                   0b1111_1111_1111_1111_1111_1110_1111_1111u32);
+        assert_eq!(
+            unsafe { tbm::_blci_u32(0b0101_0000u32) },
+            0b1111_1111_1111_1111_1111_1111_1111_1110u32);
+        assert_eq!(
+            unsafe { tbm::_blci_u32(0b1111_1111u32) },
+            0b1111_1111_1111_1111_1111_1110_1111_1111u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _blci_u64() {
-        assert_eq!(tbm::_blci_u64(0b0101_0000u64),
-                   0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64);
-        assert_eq!(tbm::_blci_u64(0b1111_1111u64),
-                   0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64);
+        assert_eq!(
+            unsafe { tbm::_blci_u64(0b0101_0000u64) },
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64);
+        assert_eq!(
+            unsafe { tbm::_blci_u64(0b1111_1111u64) },
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64);
    }

    #[simd_test = "tbm"]
    fn _blcic_u32() {
-        assert_eq!(tbm::_blcic_u32(0b0101_0001u32), 0b0000_0010u32);
-        assert_eq!(tbm::_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32);
+        assert_eq!(
+            unsafe { tbm::_blcic_u32(0b0101_0001u32) },
+            0b0000_0010u32);
+        assert_eq!(
+            unsafe { tbm::_blcic_u32(0b1111_1111u32) },
+            0b1_0000_0000u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _blcic_u64() {
-        assert_eq!(tbm::_blcic_u64(0b0101_0001u64), 0b0000_0010u64);
-        assert_eq!(tbm::_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64);
+        assert_eq!(
+            unsafe { tbm::_blcic_u64(0b0101_0001u64) },
+            0b0000_0010u64);
+        assert_eq!(
+            unsafe { tbm::_blcic_u64(0b1111_1111u64) },
+            0b1_0000_0000u64);
    }

    #[simd_test = "tbm"]
    fn _blcmsk_u32() {
-        assert_eq!(tbm::_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32);
-        assert_eq!(tbm::_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32);
+        assert_eq!(
+            unsafe { tbm::_blcmsk_u32(0b0101_0001u32) },
+            0b0000_0011u32);
+        assert_eq!(
+            unsafe { tbm::_blcmsk_u32(0b1111_1111u32) },
+            0b1_1111_1111u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _blcmsk_u64() {
-        assert_eq!(tbm::_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64);
-        assert_eq!(tbm::_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64);
+        assert_eq!(
+            unsafe { tbm::_blcmsk_u64(0b0101_0001u64) },
+            0b0000_0011u64);
+        assert_eq!(
+            unsafe { tbm::_blcmsk_u64(0b1111_1111u64) },
+            0b1_1111_1111u64);
    }

    #[simd_test = "tbm"]
    fn _blcs_u32() {
-       assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32);
-       assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32);
+       assert_eq!(unsafe { tbm::_blcs_u32(0b0101_0001u32) }, 0b0101_0011u32);
+       assert_eq!(unsafe { tbm::_blcs_u32(0b1111_1111u32) }, 0b1_1111_1111u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _blcs_u64() {
-       assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64);
-       assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64);
+       assert_eq!(unsafe { tbm::_blcs_u64(0b0101_0001u64) }, 0b0101_0011u64);
+       assert_eq!(unsafe { tbm::_blcs_u64(0b1111_1111u64) }, 0b1_1111_1111u64);
    }

    #[simd_test = "tbm"]
    fn _blsfill_u32() {
-        assert_eq!(tbm::_blsfill_u32(0b0101_0100u32), 0b0101_0111u32);
-        assert_eq!(tbm::_blsfill_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
+        assert_eq!(
+            unsafe { tbm::_blsfill_u32(0b0101_0100u32) },
+            0b0101_0111u32);
+        assert_eq!(
+            unsafe { tbm::_blsfill_u32(0u32) },
+            0b1111_1111_1111_1111_1111_1111_1111_1111u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _blsfill_u64() {
-        assert_eq!(tbm::_blsfill_u64(0b0101_0100u64), 0b0101_0111u64);
-        assert_eq!(tbm::_blsfill_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
+        assert_eq!(
+            unsafe { tbm::_blsfill_u64(0b0101_0100u64) },
+            0b0101_0111u64);
+        assert_eq!(
+            unsafe { tbm::_blsfill_u64(0u64) },
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
    }

    #[simd_test = "tbm"]
    fn _blsic_u32() {
-        assert_eq!(tbm::_blsic_u32(0b0101_0100u32), 0b1111_1111_1111_1111_1111_1111_1111_1011u32);
-        assert_eq!(tbm::_blsic_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
+        assert_eq!(
+            unsafe { tbm::_blsic_u32(0b0101_0100u32) },
+            0b1111_1111_1111_1111_1111_1111_1111_1011u32);
+        assert_eq!(
+            unsafe { tbm::_blsic_u32(0u32) },
+            0b1111_1111_1111_1111_1111_1111_1111_1111u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _blsic_u64() {
-        assert_eq!(tbm::_blsic_u64(0b0101_0100u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64);
-       assert_eq!(tbm::_blsic_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
+        assert_eq!(
+            unsafe { tbm::_blsic_u64(0b0101_0100u64) },
+            0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64);
+       assert_eq!(
+           unsafe { tbm::_blsic_u64(0u64) },
+           0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
    }

    #[simd_test = "tbm"]
    fn _t1mskc_u32() {
-       assert_eq!(tbm::_t1mskc_u32(0b0101_0111u32), 0b1111_1111_1111_1111_1111_1111_1111_1000u32);
-       assert_eq!(tbm::_t1mskc_u32(0u32), 0b1111_1111_1111_1111_1111_1111_1111_1111u32);
+       assert_eq!(
+           unsafe { tbm::_t1mskc_u32(0b0101_0111u32) },
+           0b1111_1111_1111_1111_1111_1111_1111_1000u32);
+       assert_eq!(
+           unsafe { tbm::_t1mskc_u32(0u32) },
+           0b1111_1111_1111_1111_1111_1111_1111_1111u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _t1mksc_u64() {
-       assert_eq!(tbm::_t1mskc_u64(0b0101_0111u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64);
-       assert_eq!(tbm::_t1mskc_u64(0u64), 0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
+       assert_eq!(
+           unsafe { tbm::_t1mskc_u64(0b0101_0111u64) },
+           0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64);
+       assert_eq!(
+           unsafe { tbm::_t1mskc_u64(0u64) },
+           0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
    }

    #[simd_test = "tbm"]
    fn _tzmsk_u32() {
-        assert_eq!(tbm::_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32);
-        assert_eq!(tbm::_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32);
+        assert_eq!(unsafe { tbm::_tzmsk_u32(0b0101_1000u32) }, 0b0000_0111u32);
+        assert_eq!(unsafe { tbm::_tzmsk_u32(0b0101_1001u32) }, 0b0000_0000u32);
    }

    #[simd_test = "tbm"]
    #[cfg(not(target_arch = "x86"))]
    fn _tzmsk_u64() {
-        assert_eq!(tbm::_tzmsk_u64(0b0101_1000u64), 0b0000_0111u64);
-        assert_eq!(tbm::_tzmsk_u64(0b0101_1001u64), 0b0000_0000u64);
+        assert_eq!(unsafe { tbm::_tzmsk_u64(0b0101_1000u64) }, 0b0000_0111u64);
+        assert_eq!(unsafe { tbm::_tzmsk_u64(0b0101_1001u64) }, 0b0000_0000u64);
    }
 }