Reorganize the x86/x86_64 intrinsic folders (#334)

The public API isn't changing in this commit, but the internal organization is
being reorganized. Instead of `x86/$subtarget/$feature.rs`, the folders are
changed to `coresimd/x86/$feature.rs` and `coresimd/x86_64/$feature.rs`. The
`arch::x86_64` module then re-exports both the contents of the `x86` module and
the `x86_64` module.
This commit is contained in:
Alex Crichton 2018-02-27 08:41:07 -06:00 committed by GitHub
parent aa4cef7723
commit 217f89bc4f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
49 changed files with 1690 additions and 1807 deletions

View file

@ -48,6 +48,7 @@ pub mod arch {
#[cfg(target_arch = "x86_64")]
pub mod x86_64 {
pub use coresimd::x86::*;
pub use coresimd::x86_64::*;
}
/// Platform-specific intrinsics for the `arm` platform.
@ -116,6 +117,8 @@ mod v16 {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod x86;
#[cfg(target_arch = "x86_64")]
mod x86_64;
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
mod arm;

View file

@ -42,15 +42,15 @@ pub unsafe fn _popcnt32(x: i32) -> i32 {
mod tests {
use stdsimd_test::simd_test;
use coresimd::x86::i586::abm;
use coresimd::x86::*;
#[simd_test = "lzcnt"]
unsafe fn _lzcnt_u32() {
assert_eq!(abm::_lzcnt_u32(0b0101_1010), 25);
unsafe fn test_lzcnt_u32() {
assert_eq!(_lzcnt_u32(0b0101_1010), 25);
}
#[simd_test = "popcnt"]
unsafe fn _popcnt32() {
assert_eq!(abm::_popcnt32(0b0101_1010), 4);
unsafe fn test_popcnt32() {
assert_eq!(_popcnt32(0b0101_1010), 4);
}
}

View file

@ -96,59 +96,59 @@ extern "C" {
mod tests {
use stdsimd_test::simd_test;
use coresimd::x86::i586::bmi;
use coresimd::x86::*;
#[simd_test = "bmi"]
unsafe fn _bextr_u32() {
let r = bmi::_bextr_u32(0b0101_0000u32, 4, 4);
unsafe fn test_bextr_u32() {
let r = _bextr_u32(0b0101_0000u32, 4, 4);
assert_eq!(r, 0b0000_0101u32);
}
#[simd_test = "bmi"]
unsafe fn _andn_u32() {
assert_eq!(bmi::_andn_u32(0, 0), 0);
assert_eq!(bmi::_andn_u32(0, 1), 1);
assert_eq!(bmi::_andn_u32(1, 0), 0);
assert_eq!(bmi::_andn_u32(1, 1), 0);
unsafe fn test_andn_u32() {
assert_eq!(_andn_u32(0, 0), 0);
assert_eq!(_andn_u32(0, 1), 1);
assert_eq!(_andn_u32(1, 0), 0);
assert_eq!(_andn_u32(1, 1), 0);
let r = bmi::_andn_u32(0b0000_0000u32, 0b0000_0000u32);
let r = _andn_u32(0b0000_0000u32, 0b0000_0000u32);
assert_eq!(r, 0b0000_0000u32);
let r = bmi::_andn_u32(0b0000_0000u32, 0b1111_1111u32);
let r = _andn_u32(0b0000_0000u32, 0b1111_1111u32);
assert_eq!(r, 0b1111_1111u32);
let r = bmi::_andn_u32(0b1111_1111u32, 0b0000_0000u32);
let r = _andn_u32(0b1111_1111u32, 0b0000_0000u32);
assert_eq!(r, 0b0000_0000u32);
let r = bmi::_andn_u32(0b1111_1111u32, 0b1111_1111u32);
let r = _andn_u32(0b1111_1111u32, 0b1111_1111u32);
assert_eq!(r, 0b0000_0000u32);
let r = bmi::_andn_u32(0b0100_0000u32, 0b0101_1101u32);
let r = _andn_u32(0b0100_0000u32, 0b0101_1101u32);
assert_eq!(r, 0b0001_1101u32);
}
#[simd_test = "bmi"]
unsafe fn _blsi_u32() {
assert_eq!(bmi::_blsi_u32(0b1101_0000u32), 0b0001_0000u32);
unsafe fn test_blsi_u32() {
assert_eq!(_blsi_u32(0b1101_0000u32), 0b0001_0000u32);
}
#[simd_test = "bmi"]
unsafe fn _blsmsk_u32() {
let r = bmi::_blsmsk_u32(0b0011_0000u32);
unsafe fn test_blsmsk_u32() {
let r = _blsmsk_u32(0b0011_0000u32);
assert_eq!(r, 0b0001_1111u32);
}
#[simd_test = "bmi"]
unsafe fn _blsr_u32() {
unsafe fn test_blsr_u32() {
// TODO: test the behavior when the input is 0
let r = bmi::_blsr_u32(0b0011_0000u32);
let r = _blsr_u32(0b0011_0000u32);
assert_eq!(r, 0b0010_0000u32);
}
#[simd_test = "bmi"]
unsafe fn _tzcnt_u32() {
assert_eq!(bmi::_tzcnt_u32(0b0000_0001u32), 0u32);
assert_eq!(bmi::_tzcnt_u32(0b0000_0000u32), 32u32);
assert_eq!(bmi::_tzcnt_u32(0b1001_0000u32), 4u32);
unsafe fn test_tzcnt_u32() {
assert_eq!(_tzcnt_u32(0b0000_0001u32), 0u32);
assert_eq!(_tzcnt_u32(0b0000_0000u32), 32u32);
assert_eq!(_tzcnt_u32(0b1001_0000u32), 4u32);
}
}

View file

@ -67,10 +67,10 @@ extern "C" {
mod tests {
use stdsimd_test::simd_test;
use coresimd::x86::i586::bmi2;
use coresimd::x86::*;
#[simd_test = "bmi2"]
unsafe fn _pext_u32() {
unsafe fn test_pext_u32() {
let n = 0b1011_1110_1001_0011u32;
let m0 = 0b0110_0011_1000_0101u32;
@ -79,12 +79,12 @@ mod tests {
let m1 = 0b1110_1011_1110_1111u32;
let s1 = 0b0001_0111_0100_0011u32;
assert_eq!(bmi2::_pext_u32(n, m0), s0);
assert_eq!(bmi2::_pext_u32(n, m1), s1);
assert_eq!(_pext_u32(n, m0), s0);
assert_eq!(_pext_u32(n, m1), s1);
}
#[simd_test = "bmi2"]
unsafe fn _pdep_u32() {
unsafe fn test_pdep_u32() {
let n = 0b1011_1110_1001_0011u32;
let m0 = 0b0110_0011_1000_0101u32;
@ -93,23 +93,23 @@ mod tests {
let m1 = 0b1110_1011_1110_1111u32;
let s1 = 0b1110_1001_0010_0011u32;
assert_eq!(bmi2::_pdep_u32(n, m0), s0);
assert_eq!(bmi2::_pdep_u32(n, m1), s1);
assert_eq!(_pdep_u32(n, m0), s0);
assert_eq!(_pdep_u32(n, m1), s1);
}
#[simd_test = "bmi2"]
unsafe fn _bzhi_u32() {
unsafe fn test_bzhi_u32() {
let n = 0b1111_0010u32;
let s = 0b0001_0010u32;
assert_eq!(bmi2::_bzhi_u32(n, 5), s);
assert_eq!(_bzhi_u32(n, 5), s);
}
#[simd_test = "bmi2"]
unsafe fn _mulx_u32() {
unsafe fn test_mulx_u32() {
let a: u32 = 4_294_967_200;
let b: u32 = 2;
let mut hi = 0;
let lo = bmi2::_mulx_u32(a, b, &mut hi);
let lo = _mulx_u32(a, b, &mut hi);
/*
result = 8589934400
= 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64

View file

@ -79,7 +79,7 @@ pub fn has_cpuid() -> bool {
}
#[cfg(target_arch = "x86")]
{
use coresimd::x86::i386::{__readeflags, __writeeflags};
use coresimd::x86::{__readeflags, __writeeflags};
// On `x86` the `cpuid` instruction is not always available.
// This follows the approach indicated in:
@ -121,7 +121,7 @@ pub unsafe fn __get_cpuid_max(leaf: u32) -> (u32, u32) {
#[cfg(test)]
mod tests {
use coresimd::x86::i586::cpuid;
use coresimd::x86::*;
#[test]
fn test_always_has_cpuid() {
@ -133,7 +133,6 @@ mod tests {
#[cfg(target_arch = "x86")]
#[test]
fn test_has_cpuid() {
use coresimd::x86::i386::__readeflags;
unsafe {
let before = __readeflags();

View file

@ -34,7 +34,7 @@ pub unsafe fn __writeeflags(eflags: u64) {
#[cfg(test)]
mod tests {
use coresimd::x86::i386::*;
use coresimd::x86::*;
#[test]
fn test_eflags() {

View file

@ -1,15 +0,0 @@
//! `i386` intrinsics
mod eflags;
pub use self::eflags::*;
#[cfg(dont_compile_me)] // TODO: need to upstream `fxsr` target feature
mod fxsr;
#[cfg(dont_compile_me)] // TODO: need to upstream `fxsr` target feature
pub use self::fxsr::*;
mod bswap;
pub use self::bswap::*;
mod rdtsc;
pub use self::rdtsc::*;

View file

@ -1,39 +0,0 @@
//! `i586` intrinsics
pub use self::cpuid::*;
pub use self::xsave::*;
pub use self::sse::*;
pub use self::sse2::*;
pub use self::sse3::*;
pub use self::ssse3::*;
pub use self::sse41::*;
pub use self::sse42::*;
pub use self::avx::*;
pub use self::avx2::*;
pub use self::abm::*;
pub use self::bmi::*;
pub use self::bmi2::*;
#[cfg(not(feature = "intel_sde"))]
pub use self::tbm::*;
mod cpuid;
mod xsave;
mod sse;
mod sse2;
mod sse3;
mod ssse3;
mod sse41;
mod sse42;
mod avx;
mod avx2;
mod abm;
mod bmi;
mod bmi2;
#[cfg(not(feature = "intel_sde"))]
mod tbm;

View file

@ -1,33 +0,0 @@
//! `i686` intrinsics
mod aes;
pub use self::aes::*;
mod rdrand;
pub use self::rdrand::*;
mod mmx;
pub use self::mmx::*;
mod pclmulqdq;
pub use self::pclmulqdq::*;
mod sse;
pub use self::sse::*;
mod sse2;
pub use self::sse2::*;
mod ssse3;
pub use self::ssse3::*;
mod sse41;
pub use self::sse41::*;
mod sse42;
pub use self::sse42::*;
#[cfg(not(feature = "intel_sde"))]
mod sse4a;
#[cfg(not(feature = "intel_sde"))]
pub use self::sse4a::*;

View file

@ -1,710 +0,0 @@
//! `i686` Streaming SIMD Extensions (SSE)
use coresimd::x86::*;
#[cfg(test)]
use stdsimd_test::assert_instr;
#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.sse.cvtpi2ps"]
fn cvtpi2ps(a: __m128, b: __m64) -> __m128;
#[link_name = "llvm.x86.mmx.maskmovq"]
fn maskmovq(a: __m64, mask: __m64, mem_addr: *mut i8);
#[link_name = "llvm.x86.mmx.pextr.w"]
fn pextrw(a: __m64, imm8: i32) -> i32;
#[link_name = "llvm.x86.mmx.pinsr.w"]
fn pinsrw(a: __m64, d: i32, imm8: i32) -> __m64;
#[link_name = "llvm.x86.mmx.pmovmskb"]
fn pmovmskb(a: __m64) -> i32;
#[link_name = "llvm.x86.sse.pshuf.w"]
fn pshufw(a: __m64, imm8: i8) -> __m64;
#[link_name = "llvm.x86.mmx.pmaxs.w"]
fn pmaxsw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pmaxu.b"]
fn pmaxub(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pmins.w"]
fn pminsw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pminu.b"]
fn pminub(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pmulhu.w"]
fn pmulhuw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pavg.b"]
fn pavgb(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pavg.w"]
fn pavgw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.psad.bw"]
fn psadbw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.sse.cvtps2pi"]
fn cvtps2pi(a: __m128) -> __m64;
#[link_name = "llvm.x86.sse.cvttps2pi"]
fn cvttps2pi(a: __m128) -> __m64;
}
/// Compares the packed 16-bit signed integers of `a` and `b` writing the
/// greatest value into the result.
///
/// Operates lane-wise on the four 16-bit lanes of each `__m64` and lowers
/// to a single `pmaxsw` instruction (requires both `sse` and `mmx`).
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmaxsw))]
pub unsafe fn _mm_max_pi16(a: __m64, b: __m64) -> __m64 {
pmaxsw(a, b)
}
/// Compares the packed 16-bit signed integers of `a` and `b` writing the
/// greatest value into the result.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmaxsw))]
pub unsafe fn _m_pmaxsw(a: __m64, b: __m64) -> __m64 {
_mm_max_pi16(a, b)
}
/// Compares the packed 8-bit unsigned integers of `a` and `b` writing the
/// greatest value into the result.
///
/// NOTE(review): the original comment said "signed", but `pmaxub` computes
/// an *unsigned* byte maximum, matching the `pu8` suffix.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmaxub))]
pub unsafe fn _mm_max_pu8(a: __m64, b: __m64) -> __m64 {
pmaxub(a, b)
}
/// Compares the packed 8-bit unsigned integers of `a` and `b` writing the
/// greatest value into the result.
///
/// Instruction-mnemonic alias for [`_mm_max_pu8`]; the original comment
/// said "signed", but `pmaxub` is an unsigned byte maximum.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmaxub))]
pub unsafe fn _m_pmaxub(a: __m64, b: __m64) -> __m64 {
_mm_max_pu8(a, b)
}
/// Compares the packed 16-bit signed integers of `a` and `b` writing the
/// smallest value into the result.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pminsw))]
pub unsafe fn _mm_min_pi16(a: __m64, b: __m64) -> __m64 {
pminsw(a, b)
}
/// Compares the packed 16-bit signed integers of `a` and `b` writing the
/// smallest value into the result.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pminsw))]
pub unsafe fn _m_pminsw(a: __m64, b: __m64) -> __m64 {
_mm_min_pi16(a, b)
}
/// Compares the packed 8-bit unsigned integers of `a` and `b` writing the
/// smallest value into the result.
///
/// NOTE(review): the original comment said "signed", but `pminub` computes
/// an *unsigned* byte minimum, matching the `pu8` suffix.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pminub))]
pub unsafe fn _mm_min_pu8(a: __m64, b: __m64) -> __m64 {
pminub(a, b)
}
/// Compares the packed 8-bit unsigned integers of `a` and `b` writing the
/// smallest value into the result.
///
/// Instruction-mnemonic alias for [`_mm_min_pu8`]; the original comment
/// said "signed", but `pminub` is an unsigned byte minimum.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pminub))]
pub unsafe fn _m_pminub(a: __m64, b: __m64) -> __m64 {
_mm_min_pu8(a, b)
}
/// Multiplies packed 16-bit unsigned integer values and writes the
/// high-order 16 bits of each 32-bit product to the corresponding bits in
/// the destination.
///
/// Each 16x16 -> 32-bit unsigned product keeps only its upper half
/// (`pmulhuw`).
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmulhuw))]
pub unsafe fn _mm_mulhi_pu16(a: __m64, b: __m64) -> __m64 {
pmulhuw(a, b)
}
/// Multiplies packed 16-bit unsigned integer values and writes the
/// high-order 16 bits of each 32-bit product to the corresponding bits in
/// the destination.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmulhuw))]
pub unsafe fn _m_pmulhuw(a: __m64, b: __m64) -> __m64 {
_mm_mulhi_pu16(a, b)
}
/// Computes the rounded averages of the packed unsigned 8-bit integer
/// values and writes the averages to the corresponding bits in the
/// destination.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pavgb))]
pub unsafe fn _mm_avg_pu8(a: __m64, b: __m64) -> __m64 {
pavgb(a, b)
}
/// Computes the rounded averages of the packed unsigned 8-bit integer
/// values and writes the averages to the corresponding bits in the
/// destination.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pavgb))]
pub unsafe fn _m_pavgb(a: __m64, b: __m64) -> __m64 {
_mm_avg_pu8(a, b)
}
/// Computes the rounded averages of the packed unsigned 16-bit integer
/// values and writes the averages to the corresponding bits in the
/// destination.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pavgw))]
pub unsafe fn _mm_avg_pu16(a: __m64, b: __m64) -> __m64 {
pavgw(a, b)
}
/// Computes the rounded averages of the packed unsigned 16-bit integer
/// values and writes the averages to the corresponding bits in the
/// destination.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pavgw))]
pub unsafe fn _m_pavgw(a: __m64, b: __m64) -> __m64 {
_mm_avg_pu16(a, b)
}
/// Subtracts the corresponding 8-bit unsigned integer values of the two
/// 64-bit vector operands and computes the absolute value for each of the
/// difference. Then the sum of the 8 absolute differences is written to the
/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
///
/// Single `psadbw` (sum of absolute differences) instruction.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(psadbw))]
pub unsafe fn _mm_sad_pu8(a: __m64, b: __m64) -> __m64 {
psadbw(a, b)
}
/// Subtracts the corresponding 8-bit unsigned integer values of the two
/// 64-bit vector operands and computes the absolute value for each of the
/// difference. Then sum of the 8 absolute differences is written to the
/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(psadbw))]
pub unsafe fn _m_psadbw(a: __m64, b: __m64) -> __m64 {
_mm_sad_pu8(a, b)
}
/// Converts two elements of a 64-bit vector of [2 x i32] into two
/// floating point values and writes them to the lower 64-bits of the
/// destination. The remaining higher order elements of the destination are
/// copied from the corresponding elements in the first operand.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpi32_ps(a: __m128, b: __m64) -> __m128 {
cvtpi2ps(a, b)
}
/// Converts two elements of a 64-bit vector of [2 x i32] into two
/// floating point values and writes them to the lower 64-bits of the
/// destination. The remaining higher order elements of the destination are
/// copied from the corresponding elements in the first operand.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvt_pi2ps(a: __m128, b: __m64) -> __m128 {
_mm_cvtpi32_ps(a, b)
}
/// Converts the lower 4 8-bit values of `a` into a 128-bit vector of 4 `f32`s.
///
/// The bytes are treated as *signed*: `0 > a` produces an all-ones lane
/// wherever a byte is negative, and interleaving `a` with that mask
/// sign-extends each byte to 16 bits before the i16 -> f32 conversion.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpi8_ps(a: __m64) -> __m128 {
// Build a per-lane sign mask (all-ones where a lane of `a` is negative).
let b = _mm_setzero_si64();
let b = _mm_cmpgt_pi8(b, a);
// Interleave low bytes of `a` with the mask: sign-extend i8 -> i16.
let b = _mm_unpacklo_pi8(a, b);
_mm_cvtpi16_ps(b)
}
/// Converts the lower 4 8-bit values of `a` into a 128-bit vector of 4 `f32`s.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpu8_ps(a: __m64) -> __m128 {
let b = _mm_setzero_si64();
let b = _mm_unpacklo_pi8(a, b);
_mm_cvtpi16_ps(b)
}
/// Converts a 64-bit vector of `i16`s into a 128-bit vector of 4 `f32`s.
///
/// `cvtpi2ps` only converts two 32-bit lanes at a time, so the four 16-bit
/// lanes are sign-extended to i32 in two halves and converted in two steps:
/// high pair first, then shuffled into the upper half of the result while
/// the low pair fills the lower half.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpi16_ps(a: __m64) -> __m128 {
// Sign mask: all-ones in lanes where `a` is negative.
let b = _mm_setzero_si64();
let b = _mm_cmpgt_pi16(b, a);
// Sign-extend the upper two i16 lanes to i32 and convert them.
let c = _mm_unpackhi_pi16(a, b);
let r = _mm_setzero_ps();
let r = cvtpi2ps(r, c);
// Duplicate the converted pair into the high half of the result.
let r = _mm_movelh_ps(r, r);
// Sign-extend and convert the lower two lanes into the low half.
let c = _mm_unpacklo_pi16(a, b);
cvtpi2ps(r, c)
}
/// Converts a 64-bit vector of `i16`s into a 128-bit vector of 4 `f32`s.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpu16_ps(a: __m64) -> __m128 {
let b = _mm_setzero_si64();
let c = _mm_unpackhi_pi16(a, b);
let r = _mm_setzero_ps();
let r = cvtpi2ps(r, c);
let r = _mm_movelh_ps(r, r);
let c = _mm_unpacklo_pi16(a, b);
cvtpi2ps(r, c)
}
/// Converts the two 32-bit signed integer values from each 64-bit vector
/// operand of [2 x i32] into a 128-bit vector of [4 x float].
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpi32x2_ps(a: __m64, b: __m64) -> __m128 {
let c = i586::_mm_setzero_ps();
let c = _mm_cvtpi32_ps(c, b);
let c = i586::_mm_movelh_ps(c, c);
_mm_cvtpi32_ps(c, a)
}
/// Conditionally copies the values from each 8-bit element in the first
/// 64-bit integer vector operand to the specified memory location, as
/// specified by the most significant bit in the corresponding element in the
/// second 64-bit integer vector operand.
///
/// To minimize caching, the data is flagged as non-temporal
/// (unlikely to be used again soon).
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(maskmovq))]
pub unsafe fn _mm_maskmove_si64(a: __m64, mask: __m64, mem_addr: *mut i8) {
maskmovq(a, mask, mem_addr)
}
/// Conditionally copies the values from each 8-bit element in the first
/// 64-bit integer vector operand to the specified memory location, as
/// specified by the most significant bit in the corresponding element in the
/// second 64-bit integer vector operand.
///
/// To minimize caching, the data is flagged as non-temporal
/// (unlikely to be used again soon).
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(maskmovq))]
pub unsafe fn _m_maskmovq(a: __m64, mask: __m64, mem_addr: *mut i8) {
_mm_maskmove_si64(a, mask, mem_addr)
}
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
/// returns it, as specified by the immediate integer operand.
///
/// `imm2` must be a compile-time constant in `0..=3` (enforced by
/// `#[rustc_args_required_const]`); `constify_imm2!` expands the call for
/// every possible value so the immediate can be encoded in the `pextrw`
/// instruction itself.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pextrw, imm2 = 0))]
#[rustc_args_required_const(1)]
pub unsafe fn _mm_extract_pi16(a: __m64, imm2: i32) -> i32 {
macro_rules! call {
($imm2:expr) => { pextrw(a, $imm2) as i32 }
}
constify_imm2!(imm2, call)
}
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
/// returns it, as specified by the immediate integer operand.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pextrw, imm2 = 0))]
#[rustc_args_required_const(1)]
pub unsafe fn _m_pextrw(a: __m64, imm2: i32) -> i32 {
macro_rules! call {
($imm2:expr) => { pextrw(a, $imm2) as i32 }
}
constify_imm2!(imm2, call)
}
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
/// specified by the immediate operand `n`.
///
/// `imm2` must be a compile-time constant in `0..=3`; `constify_imm2!`
/// monomorphizes the call so `pinsrw` gets a literal immediate.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pinsrw, imm2 = 0))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm_insert_pi16(a: __m64, d: i32, imm2: i32) -> __m64 {
macro_rules! call {
($imm2:expr) => { pinsrw(a, d, $imm2) }
}
constify_imm2!(imm2, call)
}
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
/// specified by the immediate operand `n`.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pinsrw, imm2 = 0))]
#[rustc_args_required_const(2)]
pub unsafe fn _m_pinsrw(a: __m64, d: i32, imm2: i32) -> __m64 {
macro_rules! call {
($imm2:expr) => { pinsrw(a, d, $imm2) }
}
constify_imm2!(imm2, call)
}
/// Takes the most significant bit from each 8-bit element in a 64-bit
/// integer vector to create a 16-bit mask value. Zero-extends the value to
/// 32-bit integer and writes it to the destination.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmovmskb))]
pub unsafe fn _mm_movemask_pi8(a: __m64) -> i32 {
pmovmskb(a)
}
/// Takes the most significant bit from each 8-bit element in a 64-bit
/// integer vector to create a 16-bit mask value. Zero-extends the value to
/// 32-bit integer and writes it to the destination.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmovmskb))]
pub unsafe fn _m_pmovmskb(a: __m64) -> i32 {
_mm_movemask_pi8(a)
}
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
/// destination, as specified by the immediate value operand.
///
/// Each 2-bit field of `imm8` selects the source lane for one destination
/// lane; `constify_imm8!` expands all 256 possible immediates so `pshufw`
/// receives a literal operand.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pshufw, imm8 = 0))]
#[rustc_args_required_const(1)]
pub unsafe fn _mm_shuffle_pi16(a: __m64, imm8: i32) -> __m64 {
macro_rules! call {
($imm8:expr) => { pshufw(a, $imm8) }
}
constify_imm8!(imm8, call)
}
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
/// destination, as specified by the immediate value operand.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pshufw, imm8 = 0))]
#[rustc_args_required_const(1)]
pub unsafe fn _m_pshufw(a: __m64, imm8: i32) -> __m64 {
macro_rules! call {
($imm8:expr) => { pshufw(a, $imm8) }
}
constify_imm8!(imm8, call)
}
/// Convert the two lower packed single-precision (32-bit) floating-point
/// elements in `a` to packed 32-bit integers with truncation.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvttps2pi))]
pub unsafe fn _mm_cvttps_pi32(a: __m128) -> __m64 {
cvttps2pi(a)
}
/// Convert the two lower packed single-precision (32-bit) floating-point
/// elements in `a` to packed 32-bit integers with truncation.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvttps2pi))]
pub unsafe fn _mm_cvtt_ps2pi(a: __m128) -> __m64 {
_mm_cvttps_pi32(a)
}
/// Convert the two lower packed single-precision (32-bit) floating-point
/// elements in `a` to packed 32-bit integers.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtps2pi))]
pub unsafe fn _mm_cvtps_pi32(a: __m128) -> __m64 {
cvtps2pi(a)
}
/// Convert the two lower packed single-precision (32-bit) floating-point
/// elements in `a` to packed 32-bit integers.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtps2pi))]
pub unsafe fn _mm_cvt_ps2pi(a: __m128) -> __m64 {
_mm_cvtps_pi32(a)
}
/// Convert packed single-precision (32-bit) floating-point elements in `a` to
/// packed 16-bit integers.
///
/// `cvtps2pi` only yields two i32 lanes per call, so the low and high float
/// pairs are converted separately and re-packed (with saturation, via
/// `_mm_packs_pi32`) into four i16 lanes.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtps2pi))]
pub unsafe fn _mm_cvtps_pi16(a: __m128) -> __m64 {
// Convert the low two floats.
let b = _mm_cvtps_pi32(a);
// Move the high two floats into the low half and convert them too.
let a = _mm_movehl_ps(a, a);
let c = _mm_cvtps_pi32(a);
_mm_packs_pi32(b, c)
}
/// Convert packed single-precision (32-bit) floating-point elements in `a` to
/// packed 8-bit integers, and returns them in the lower 4 elements of the
/// result.
///
/// Converts to i16 first, then packs against zero so the upper four byte
/// lanes of the result are cleared.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtps2pi))]
pub unsafe fn _mm_cvtps_pi8(a: __m128) -> __m64 {
let b = _mm_cvtps_pi16(a);
let c = _mm_setzero_si64();
_mm_packs_pi16(b, c)
}
#[cfg(test)]
mod tests {
use coresimd::x86::*;
use stdsimd_test::simd_test;
#[simd_test = "sse,mmx"]
unsafe fn test_mm_max_pi16() {
let a = _mm_setr_pi16(-1, 6, -3, 8);
let b = _mm_setr_pi16(5, -2, 7, -4);
let r = _mm_setr_pi16(5, 6, 7, 8);
assert_eq_m64(r, _mm_max_pi16(a, b));
assert_eq_m64(r, _m_pmaxsw(a, b));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_max_pu8() {
let a = _mm_setr_pi8(2, 6, 3, 8, 2, 6, 3, 8);
let b = _mm_setr_pi8(5, 2, 7, 4, 5, 2, 7, 4);
let r = _mm_setr_pi8(5, 6, 7, 8, 5, 6, 7, 8);
assert_eq_m64(r, _mm_max_pu8(a, b));
assert_eq_m64(r, _m_pmaxub(a, b));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_min_pi16() {
let a = _mm_setr_pi16(-1, 6, -3, 8);
let b = _mm_setr_pi16(5, -2, 7, -4);
let r = _mm_setr_pi16(-1, -2, -3, -4);
assert_eq_m64(r, _mm_min_pi16(a, b));
assert_eq_m64(r, _m_pminsw(a, b));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_min_pu8() {
let a = _mm_setr_pi8(2, 6, 3, 8, 2, 6, 3, 8);
let b = _mm_setr_pi8(5, 2, 7, 4, 5, 2, 7, 4);
let r = _mm_setr_pi8(2, 2, 3, 4, 2, 2, 3, 4);
assert_eq_m64(r, _mm_min_pu8(a, b));
assert_eq_m64(r, _m_pminub(a, b));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_mulhi_pu16() {
let (a, b) = (_mm_set1_pi16(1000), _mm_set1_pi16(1001));
let r = _mm_mulhi_pu16(a, b);
assert_eq_m64(r, _mm_set1_pi16(15));
}
#[simd_test = "sse,mmx"]
unsafe fn test_m_pmulhuw() {
let (a, b) = (_mm_set1_pi16(1000), _mm_set1_pi16(1001));
let r = _m_pmulhuw(a, b);
assert_eq_m64(r, _mm_set1_pi16(15));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_avg_pu8() {
let (a, b) = (_mm_set1_pi8(3), _mm_set1_pi8(9));
let r = _mm_avg_pu8(a, b);
assert_eq_m64(r, _mm_set1_pi8(6));
let r = _m_pavgb(a, b);
assert_eq_m64(r, _mm_set1_pi8(6));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_avg_pu16() {
let (a, b) = (_mm_set1_pi16(3), _mm_set1_pi16(9));
let r = _mm_avg_pu16(a, b);
assert_eq_m64(r, _mm_set1_pi16(6));
let r = _m_pavgw(a, b);
assert_eq_m64(r, _mm_set1_pi16(6));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_sad_pu8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = _mm_setr_pi8(
255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
1, 2, 3, 4,
);
let b = _mm_setr_pi8(0, 0, 0, 0, 2, 1, 2, 1);
let r = _mm_sad_pu8(a, b);
assert_eq_m64(r, _mm_setr_pi16(1020, 0, 0, 0));
let r = _m_psadbw(a, b);
assert_eq_m64(r, _mm_setr_pi16(1020, 0, 0, 0));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtpi32_ps() {
let a = _mm_setr_ps(0., 0., 3., 4.);
let b = _mm_setr_pi32(1, 2);
let expected = _mm_setr_ps(1., 2., 3., 4.);
let r = _mm_cvtpi32_ps(a, b);
assert_eq_m128(r, expected);
let r = _mm_cvt_pi2ps(a, b);
assert_eq_m128(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtpi16_ps() {
let a = _mm_setr_pi16(1, 2, 3, 4);
let expected = _mm_setr_ps(1., 2., 3., 4.);
let r = _mm_cvtpi16_ps(a);
assert_eq_m128(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtpu16_ps() {
let a = _mm_setr_pi16(1, 2, 3, 4);
let expected = _mm_setr_ps(1., 2., 3., 4.);
let r = _mm_cvtpu16_ps(a);
assert_eq_m128(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtpi8_ps() {
let a = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
let expected = _mm_setr_ps(1., 2., 3., 4.);
let r = _mm_cvtpi8_ps(a);
assert_eq_m128(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtpu8_ps() {
let a = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
let expected = _mm_setr_ps(1., 2., 3., 4.);
let r = _mm_cvtpu8_ps(a);
assert_eq_m128(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtpi32x2_ps() {
let a = _mm_setr_pi32(1, 2);
let b = _mm_setr_pi32(3, 4);
let expected = _mm_setr_ps(1., 2., 3., 4.);
let r = _mm_cvtpi32x2_ps(a, b);
assert_eq_m128(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_maskmove_si64() {
let a = _mm_set1_pi8(9);
let mask = _mm_setr_pi8(0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0);
let mut r = _mm_set1_pi8(0);
_mm_maskmove_si64(a, mask, &mut r as *mut _ as *mut i8);
let e = _mm_setr_pi8(0, 0, 9, 0, 0, 0, 0, 0);
assert_eq_m64(r, e);
let mut r = _mm_set1_pi8(0);
_m_maskmovq(a, mask, &mut r as *mut _ as *mut i8);
assert_eq_m64(r, e);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_extract_pi16() {
let a = _mm_setr_pi16(1, 2, 3, 4);
let r = _mm_extract_pi16(a, 0);
assert_eq!(r, 1);
let r = _mm_extract_pi16(a, 1);
assert_eq!(r, 2);
let r = _m_pextrw(a, 1);
assert_eq!(r, 2);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_insert_pi16() {
let a = _mm_setr_pi16(1, 2, 3, 4);
let r = _mm_insert_pi16(a, 0, 0b0);
let expected = _mm_setr_pi16(0, 2, 3, 4);
assert_eq_m64(r, expected);
let r = _mm_insert_pi16(a, 0, 0b10);
let expected = _mm_setr_pi16(1, 2, 0, 4);
assert_eq_m64(r, expected);
let r = _m_pinsrw(a, 0, 0b10);
assert_eq_m64(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_movemask_pi8() {
let a =
_mm_setr_pi16(0b1000_0000, 0b0100_0000, 0b1000_0000, 0b0100_0000);
let r = _mm_movemask_pi8(a);
assert_eq!(r, 0b10001);
let r = _m_pmovmskb(a);
assert_eq!(r, 0b10001);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_shuffle_pi16() {
let a = _mm_setr_pi16(1, 2, 3, 4);
let r = _mm_shuffle_pi16(a, 0b00_01_01_11);
let expected = _mm_setr_pi16(4, 2, 2, 1);
assert_eq_m64(r, expected);
let r = _m_pshufw(a, 0b00_01_01_11);
assert_eq_m64(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtps_pi32() {
let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
let r = _mm_setr_pi32(1, 2);
assert_eq_m64(r, _mm_cvtps_pi32(a));
assert_eq_m64(r, _mm_cvt_ps2pi(a));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvttps_pi32() {
let a = _mm_setr_ps(7.0, 2.0, 3.0, 4.0);
let r = _mm_setr_pi32(7, 2);
assert_eq_m64(r, _mm_cvttps_pi32(a));
assert_eq_m64(r, _mm_cvtt_ps2pi(a));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtps_pi16() {
let a = _mm_setr_ps(7.0, 2.0, 3.0, 4.0);
let r = _mm_setr_pi16(7, 2, 3, 4);
assert_eq_m64(r, _mm_cvtps_pi16(a));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtps_pi8() {
let a = _mm_setr_ps(7.0, 2.0, 3.0, 4.0);
let r = _mm_setr_pi8(7, 2, 3, 4, 0, 0, 0, 0);
assert_eq_m64(r, _mm_cvtps_pi8(a));
}
}

View file

@ -1,225 +0,0 @@
//! `i686`'s Streaming SIMD Extensions 2 (SSE2)
use coresimd::simd_llvm::simd_extract;
use coresimd::x86::*;
use mem;
#[cfg(test)]
use stdsimd_test::assert_instr;
/// Adds two signed or unsigned 64-bit integer values, returning the
/// lower 64 bits of the sum.
///
/// Wrapping addition via the MMX `paddq` instruction; overflow is silently
/// truncated to 64 bits.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(paddq))]
pub unsafe fn _mm_add_si64(a: __m64, b: __m64) -> __m64 {
paddq(a, b)
}
/// Multiplies 32-bit unsigned integer values contained in the lower bits
/// of the two 64-bit integer vectors and returns the 64-bit unsigned
/// product.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(pmuludq))]
pub unsafe fn _mm_mul_su32(a: __m64, b: __m64) -> __m64 {
pmuludq(a, b)
}
/// Subtracts signed or unsigned 64-bit integer values and writes the
/// difference to the corresponding bits in the destination.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(psubq))]
pub unsafe fn _mm_sub_si64(a: __m64, b: __m64) -> __m64 {
psubq(a, b)
}
/// Converts the two signed 32-bit integer elements of a 64-bit vector of
/// [2 x i32] into two double-precision floating-point values, returned in a
/// 128-bit vector of [2 x double].
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2pd))]
pub unsafe fn _mm_cvtpi32_pd(a: __m64) -> __m128d {
cvtpi2pd(a)
}
/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
/// the specified 64-bit integer values.
///
/// `e1` becomes the upper element and `e0` the lower one, per the
/// `_mm_set_epi64x(high, low)` argument order.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// no particular instruction to test
pub unsafe fn _mm_set_epi64(e1: __m64, e0: __m64) -> __m128i {
_mm_set_epi64x(mem::transmute(e1), mem::transmute(e0))
}
/// Initializes both values in a 128-bit vector of [2 x i64] with the
/// specified 64-bit value.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// no particular instruction to test
pub unsafe fn _mm_set1_epi64(a: __m64) -> __m128i {
_mm_set_epi64x(mem::transmute(a), mem::transmute(a))
}
/// Constructs a 128-bit integer vector, initialized in reverse order
/// with the specified 64-bit integral values.
///
/// The "reverse" (memory) order is achieved by passing the arguments to
/// `_mm_set_epi64x` swapped: `e0` ends up in the high element.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// no particular instruction to test
pub unsafe fn _mm_setr_epi64(e1: __m64, e0: __m64) -> __m128i {
_mm_set_epi64x(mem::transmute(e0), mem::transmute(e1))
}
/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
/// integer.
///
/// Extracts lane 0 of the `[2 x i64]` view and reinterprets it as `__m64`.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// #[cfg_attr(test, assert_instr(movdq2q))] // FIXME: llvm codegens wrong
// instr?
pub unsafe fn _mm_movepi64_pi64(a: __m128i) -> __m64 {
mem::transmute(simd_extract::<_, i64>(a.as_i64x2(), 0))
}
/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
/// upper bits.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// #[cfg_attr(test, assert_instr(movq2dq))] // FIXME: llvm codegens wrong
// instr?
pub unsafe fn _mm_movpi64_epi64(a: __m64) -> __m128i {
_mm_set_epi64x(0, mem::transmute(a))
}
/// Converts the two double-precision floating-point elements of a
/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
/// returned in a 64-bit vector of [2 x i32].
///
/// Rounds according to the current rounding mode; contrast
/// `_mm_cvttpd_pi32`, which always truncates.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(cvtpd2pi))]
pub unsafe fn _mm_cvtpd_pi32(a: __m128d) -> __m64 {
    cvtpd2pi(a)
}
/// Converts the two double-precision floating-point elements of a
/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
/// returned in a 64-bit vector of [2 x i32].
/// If the result of either conversion is inexact, the result is truncated
/// (rounded towards zero) regardless of the current MXCSR setting.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(cvttpd2pi))]
pub unsafe fn _mm_cvttpd_pi32(a: __m128d) -> __m64 {
    // Thin wrapper over the `llvm.x86.sse.cvttpd2pi` intrinsic declared below.
    cvttpd2pi(a)
}
// LLVM intrinsic declarations backing the wrapper functions above.
// `improper_ctypes` is allowed because SIMD vector types are not FFI-safe
// in the usual sense, but LLVM understands these link names directly.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.mmx.padd.q"]
    fn paddq(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.mmx.pmulu.dq"]
    fn pmuludq(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.mmx.psub.q"]
    fn psubq(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.sse.cvtpi2pd"]
    fn cvtpi2pd(a: __m64) -> __m128d;
    #[link_name = "llvm.x86.sse.cvtpd2pi"]
    fn cvtpd2pi(a: __m128d) -> __m64;
    #[link_name = "llvm.x86.sse.cvttpd2pi"]
    fn cvttpd2pi(a: __m128d) -> __m64;
}
// Unit tests for the `sse2,mmx` wrappers above. Each test is gated on
// runtime CPU feature support by the `simd_test` harness.
#[cfg(test)]
mod tests {
    use std::mem;
    use stdsimd_test::simd_test;

    use coresimd::x86::*;

    #[simd_test = "sse2,mmx"]
    unsafe fn test_mm_add_si64() {
        let a = 1i64;
        let b = 2i64;
        let expected = 3i64;
        let r = _mm_add_si64(mem::transmute(a), mem::transmute(b));
        assert_eq!(mem::transmute::<__m64, i64>(r), expected);
    }

    #[simd_test = "sse2,mmx"]
    unsafe fn test_mm_mul_su32() {
        let a = _mm_setr_pi32(1, 2);
        let b = _mm_setr_pi32(3, 4);
        // Only the low 32-bit lanes (1 * 3) participate in the product.
        let expected = 3u64;
        let r = _mm_mul_su32(a, b);
        assert_eq_m64(r, mem::transmute(expected));
    }

    #[simd_test = "sse2,mmx"]
    unsafe fn test_mm_sub_si64() {
        let a = 1i64;
        let b = 2i64;
        let expected = -1i64;
        let r = _mm_sub_si64(mem::transmute(a), mem::transmute(b));
        assert_eq!(mem::transmute::<__m64, i64>(r), expected);
    }

    #[simd_test = "sse2,mmx"]
    unsafe fn test_mm_cvtpi32_pd() {
        let a = _mm_setr_pi32(1, 2);
        let expected = _mm_setr_pd(1., 2.);
        let r = _mm_cvtpi32_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test = "sse2,mmx"]
    unsafe fn test_mm_set_epi64() {
        // `set` takes arguments high-to-low, `setr` low-to-high.
        let r = _mm_set_epi64(mem::transmute(1i64), mem::transmute(2i64));
        assert_eq_m128i(r, _mm_setr_epi64x(2, 1));
    }

    #[simd_test = "sse2,mmx"]
    unsafe fn test_mm_set1_epi64() {
        let r = _mm_set1_epi64(mem::transmute(1i64));
        assert_eq_m128i(r, _mm_setr_epi64x(1, 1));
    }

    #[simd_test = "sse2,mmx"]
    unsafe fn test_mm_setr_epi64() {
        let r = _mm_setr_epi64(mem::transmute(1i64), mem::transmute(2i64));
        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
    }

    #[simd_test = "sse2,mmx"]
    unsafe fn test_mm_movepi64_pi64() {
        let r = _mm_movepi64_pi64(_mm_setr_epi64x(5, 0));
        assert_eq_m64(r, _mm_setr_pi8(5, 0, 0, 0, 0, 0, 0, 0));
    }

    #[simd_test = "sse2,mmx"]
    unsafe fn test_mm_movpi64_epi64() {
        let r = _mm_movpi64_epi64(_mm_setr_pi8(5, 0, 0, 0, 0, 0, 0, 0));
        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
    }

    #[simd_test = "sse2,mmx"]
    unsafe fn test_mm_cvtpd_pi32() {
        let a = _mm_setr_pd(5., 0.);
        let r = _mm_cvtpd_pi32(a);
        assert_eq_m64(r, _mm_setr_pi32(5, 0));
    }

    #[simd_test = "sse2,mmx"]
    unsafe fn test_mm_cvttpd_pi32() {
        use std::{f64, i32};

        let a = _mm_setr_pd(5., 0.);
        let r = _mm_cvttpd_pi32(a);
        assert_eq_m64(r, _mm_setr_pi32(5, 0));

        // Out-of-range / NaN inputs produce the integer indefinite value
        // (i32::MIN) per the observed cvttpd2pi behavior.
        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttpd_pi32(a);
        assert_eq_m64(r, _mm_setr_pi32(i32::MIN, i32::MIN));
    }
}

View file

@ -1,240 +0,0 @@
//! `i686`'s Streaming SIMD Extensions 4.1 (SSE4.1)
use coresimd::v128::*;
use coresimd::x86::*;
#[cfg(test)]
use stdsimd_test::assert_instr;
// LLVM intrinsic declarations for the SSE4.1 `ptest` family used below.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse41.ptestz"]
    fn ptestz(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestc"]
    fn ptestc(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestnzc"]
    fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    // `ptest` sets ZF when (a AND mask) == 0; this returns that flag.
    ptestz(a.as_i64x2(), mask.as_i64x2())
}
/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestc(a.as_i64x2(), mask.as_i64x2())
}
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    // Convenience alias for `_mm_testz_si128`.
    _mm_testz_si128(a, mask)
}
/// Tests whether the specified bits in `a` 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
#[inline]
#[target_feature(enable = "sse4.1")]
// Both instructions are expected: `pcmpeqd` builds the all-ones mask,
// then `ptest` performs the carry test against it.
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
    // `_mm_cmpeq_epi32(a, a)` is an idiom for an all-ones vector.
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    // Convenience alias for `_mm_testnzc_si128`.
    _mm_testnzc_si128(a, mask)
}
// Unit tests for the SSE4.1 `ptest` wrappers above; gated on runtime CPU
// support by the `simd_test` harness.
#[cfg(test)]
mod tests {
    use stdsimd_test::simd_test;

    use coresimd::x86::*;

    #[simd_test = "sse4.1"]
    unsafe fn test_mm_testz_si128() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test = "sse4.1"]
    unsafe fn test_mm_testc_si128() {
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test = "sse4.1"]
    unsafe fn test_mm_testnzc_si128() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
    }

    #[simd_test = "sse4.1"]
    unsafe fn test_mm_test_all_zeros() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test = "sse4.1"]
    unsafe fn test_mm_test_all_ones() {
        let a = _mm_set1_epi8(-1);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 0);
    }

    #[simd_test = "sse4.1"]
    unsafe fn test_mm_test_mix_ones_zeros() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
    }
}

View file

@ -1,35 +0,0 @@
//! `i686`'s Streaming SIMD Extensions 4.2 (SSE4.2)
use coresimd::simd_llvm::*;
use coresimd::v128::*;
use coresimd::x86::*;
#[cfg(test)]
use stdsimd_test::assert_instr;
/// Compare packed 64-bit integers in `a` and `b` for greater-than,
/// return the results.
///
/// Each result lane is all ones when `a > b` holds for that lane and all
/// zeros otherwise.
#[inline]
#[target_feature(enable = "sse4.2")]
#[cfg_attr(test, assert_instr(pcmpgtq))]
pub unsafe fn _mm_cmpgt_epi64(a: __m128i, b: __m128i) -> __m128i {
    // `simd_gt` yields an i64x2 lane mask; transmute back to `__m128i`.
    mem::transmute(simd_gt::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
}
// Unit test for the SSE4.2 comparison wrapper above.
#[cfg(test)]
mod tests {
    use coresimd::x86::*;
    use stdsimd_test::simd_test;

    #[simd_test = "sse4.2"]
    unsafe fn test_mm_cmpgt_epi64() {
        let a = _mm_setr_epi64x(0, 0x2a);
        let b = _mm_set1_epi64x(0x00);
        let i = _mm_cmpgt_epi64(a, b);
        // Lane 0: 0 > 0 is false (zero); lane 1: 0x2a > 0 is true (all ones).
        assert_eq_m128i(
            i,
            _mm_setr_epi64x(0x00, 0xffffffffffffffffu64 as i64),
        );
    }
}

View file

@ -1,361 +0,0 @@
//! Supplemental Streaming SIMD Extensions 3 (SSSE3)
#[cfg(test)]
use stdsimd_test::assert_instr;
use coresimd::x86::*;
/// Compute the absolute value of packed 8-bit integers in `a` and
/// return the unsigned results.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pabsb))]
pub unsafe fn _mm_abs_pi8(a: __m64) -> __m64 {
    // Thin wrapper over the `llvm.x86.ssse3.pabs.b` intrinsic declared below.
    pabsb(a)
}
/// Compute the absolute value of packed 16-bit integers in `a`, and return
/// the unsigned results.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pabsw))]
pub unsafe fn _mm_abs_pi16(a: __m64) -> __m64 {
    pabsw(a)
}
/// Compute the absolute value of packed 32-bit integers in `a`, and return
/// the unsigned results.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pabsd))]
pub unsafe fn _mm_abs_pi32(a: __m64) -> __m64 {
    pabsd(a)
}
/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in
/// the corresponding 8-bit element of `b`, and return the results
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pshufb))]
pub unsafe fn _mm_shuffle_pi8(a: __m64, b: __m64) -> __m64 {
    pshufb(a, b)
}
/// Concatenates the two 64-bit integer vector operands, and right-shifts
/// the result by the number of bytes specified in the immediate operand.
///
/// `n` must be a compile-time constant (enforced by
/// `rustc_args_required_const`); the `constify_imm8` macro expands it into
/// a constant immediate for the underlying `palignr` instruction.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(palignr, n = 15))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm_alignr_pi8(a: __m64, b: __m64, n: i32) -> __m64 {
    macro_rules! call {
        ($imm8:expr) => {
            palignrb(a, b, $imm8)
        }
    }
    constify_imm8!(n, call)
}
/// Horizontally add the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [4 x i16].
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phaddw))]
pub unsafe fn _mm_hadd_pi16(a: __m64, b: __m64) -> __m64 {
    phaddw(a, b)
}
/// Horizontally add the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [2 x i32].
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phaddd))]
pub unsafe fn _mm_hadd_pi32(a: __m64, b: __m64) -> __m64 {
    phaddd(a, b)
}
/// Horizontally add the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phaddsw))]
pub unsafe fn _mm_hadds_pi16(a: __m64, b: __m64) -> __m64 {
    // Saturating variant of `_mm_hadd_pi16`.
    phaddsw(a, b)
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of [4 x i16].
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phsubw))]
pub unsafe fn _mm_hsub_pi16(a: __m64, b: __m64) -> __m64 {
    phsubw(a, b)
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of [2 x i32].
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phsubd))]
pub unsafe fn _mm_hsub_pi32(a: __m64, b: __m64) -> __m64 {
    phsubd(a, b)
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of [4 x i16]. Positive differences greater than
/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
/// saturated to 8000h.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phsubsw))]
pub unsafe fn _mm_hsubs_pi16(a: __m64, b: __m64) -> __m64 {
    // Saturating variant of `_mm_hsub_pi16`.
    phsubsw(a, b)
}
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
/// values contained in the first source operand and packed 8-bit signed
/// integer values contained in the second source operand, adds pairs of
/// contiguous products with signed saturation, and writes the 16-bit sums to
/// the corresponding bits in the destination.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pmaddubsw))]
pub unsafe fn _mm_maddubs_pi16(a: __m64, b: __m64) -> __m64 {
    // Note the asymmetry: `a` is treated as unsigned, `b` as signed.
    pmaddubsw(a, b)
}
/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
/// products to the 18 most significant bits by right-shifting, rounds the
/// truncated value by adding 1, and writes bits [16:1] to the destination.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pmulhrsw))]
pub unsafe fn _mm_mulhrs_pi16(a: __m64, b: __m64) -> __m64 {
    pmulhrsw(a, b)
}
/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit
/// integer in `b` is negative, and return the results.
/// Elements in the result are zeroed out when the corresponding element in
/// `b` is zero.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(psignb))]
pub unsafe fn _mm_sign_pi8(a: __m64, b: __m64) -> __m64 {
    psignb(a, b)
}
/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit
/// integer in `b` is negative, and return the results.
/// Elements in the result are zeroed out when the corresponding element in
/// `b` is zero.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(psignw))]
pub unsafe fn _mm_sign_pi16(a: __m64, b: __m64) -> __m64 {
    psignw(a, b)
}
/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit
/// integer in `b` is negative, and return the results.
/// Elements in the result are zeroed out when the corresponding element in
/// `b` is zero.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(psignd))]
pub unsafe fn _mm_sign_pi32(a: __m64, b: __m64) -> __m64 {
    psignd(a, b)
}
// LLVM intrinsic declarations backing the SSSE3/MMX wrapper functions above.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.ssse3.pabs.b"]
    fn pabsb(a: __m64) -> __m64;
    #[link_name = "llvm.x86.ssse3.pabs.w"]
    fn pabsw(a: __m64) -> __m64;
    #[link_name = "llvm.x86.ssse3.pabs.d"]
    fn pabsd(a: __m64) -> __m64;
    #[link_name = "llvm.x86.ssse3.pshuf.b"]
    fn pshufb(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.mmx.palignr.b"]
    fn palignrb(a: __m64, b: __m64, n: u8) -> __m64;
    #[link_name = "llvm.x86.ssse3.phadd.w"]
    fn phaddw(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.ssse3.phadd.d"]
    fn phaddd(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.ssse3.phadd.sw"]
    fn phaddsw(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.ssse3.phsub.w"]
    fn phsubw(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.ssse3.phsub.d"]
    fn phsubd(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.ssse3.phsub.sw"]
    fn phsubsw(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.ssse3.pmadd.ub.sw"]
    fn pmaddubsw(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.ssse3.pmul.hr.sw"]
    fn pmulhrsw(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.ssse3.psign.b"]
    fn psignb(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.ssse3.psign.w"]
    fn psignw(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.ssse3.psign.d"]
    fn psignd(a: __m64, b: __m64) -> __m64;
}
// Unit tests for the `ssse3,mmx` wrappers above; gated on runtime CPU
// support by the `simd_test` harness.
#[cfg(test)]
mod tests {
    use stdsimd_test::simd_test;

    use coresimd::x86::*;

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_abs_pi8() {
        let r = _mm_abs_pi8(_mm_set1_pi8(-5));
        assert_eq_m64(r, _mm_set1_pi8(5));
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_abs_pi16() {
        let r = _mm_abs_pi16(_mm_set1_pi16(-5));
        assert_eq_m64(r, _mm_set1_pi16(5));
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_abs_pi32() {
        let r = _mm_abs_pi32(_mm_set1_pi32(-5));
        assert_eq_m64(r, _mm_set1_pi32(5));
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_shuffle_pi8() {
        let a = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
        // Control bytes with the high bit set (128) zero the output lane;
        // otherwise the low 3 bits select the source byte.
        let b = _mm_setr_pi8(4, 128u8 as i8, 4, 3, 24, 12, 6, 19);
        let expected = _mm_setr_pi8(5, 0, 5, 4, 1, 5, 7, 4);
        let r = _mm_shuffle_pi8(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_alignr_pi8() {
        let a = _mm_setr_pi32(0x89ABCDEF_u32 as i32, 0x01234567_u32 as i32);
        let b = _mm_setr_pi32(0xBBAA9988_u32 as i32, 0xFFDDEECC_u32 as i32);
        let r = _mm_alignr_pi8(a, b, 4);
        assert_eq_m64(r, ::std::mem::transmute(0x89abcdefffddeecc_u64));
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_hadd_pi16() {
        let a = _mm_setr_pi16(1, 2, 3, 4);
        let b = _mm_setr_pi16(4, 128, 4, 3);
        let expected = _mm_setr_pi16(3, 7, 132, 7);
        let r = _mm_hadd_pi16(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_hadd_pi32() {
        let a = _mm_setr_pi32(1, 2);
        let b = _mm_setr_pi32(4, 128);
        let expected = _mm_setr_pi32(3, 132);
        let r = _mm_hadd_pi32(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_hadds_pi16() {
        let a = _mm_setr_pi16(1, 2, 3, 4);
        // 32767 + 1 and -32768 + -1 exercise both saturation directions.
        let b = _mm_setr_pi16(32767, 1, -32768, -1);
        let expected = _mm_setr_pi16(3, 7, 32767, -32768);
        let r = _mm_hadds_pi16(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_hsub_pi16() {
        let a = _mm_setr_pi16(1, 2, 3, 4);
        let b = _mm_setr_pi16(4, 128, 4, 3);
        let expected = _mm_setr_pi16(-1, -1, -124, 1);
        let r = _mm_hsub_pi16(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_hsub_pi32() {
        let a = _mm_setr_pi32(1, 2);
        let b = _mm_setr_pi32(4, 128);
        let expected = _mm_setr_pi32(-1, -124);
        let r = _mm_hsub_pi32(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_hsubs_pi16() {
        let a = _mm_setr_pi16(1, 2, 3, 4);
        let b = _mm_setr_pi16(4, 128, 4, 3);
        let expected = _mm_setr_pi16(-1, -1, -124, 1);
        let r = _mm_hsubs_pi16(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_maddubs_pi16() {
        let a = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_pi8(4, 63, 4, 3, 24, 12, 6, 19);
        let expected = _mm_setr_pi16(130, 24, 192, 194);
        let r = _mm_maddubs_pi16(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_mulhrs_pi16() {
        let a = _mm_setr_pi16(1, 2, 3, 4);
        let b = _mm_setr_pi16(4, 32767, -1, -32768);
        let expected = _mm_setr_pi16(0, 2, 0, -4);
        let r = _mm_mulhrs_pi16(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_sign_pi8() {
        let a = _mm_setr_pi8(1, 2, 3, 4, -5, -6, 7, 8);
        let b = _mm_setr_pi8(4, 64, 0, 3, 1, -1, -2, 1);
        let expected = _mm_setr_pi8(1, 2, 0, 4, -5, 6, -7, 8);
        let r = _mm_sign_pi8(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_sign_pi16() {
        let a = _mm_setr_pi16(-1, 2, 3, 4);
        let b = _mm_setr_pi16(1, -1, 1, 0);
        let expected = _mm_setr_pi16(-1, -2, 3, 0);
        let r = _mm_sign_pi16(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test = "ssse3,mmx"]
    unsafe fn test_mm_sign_pi32() {
        let a = _mm_setr_pi32(-1, 2);
        let b = _mm_setr_pi32(1, 0);
        let expected = _mm_setr_pi32(-1, 0);
        let r = _mm_sign_pi32(a, b);
        assert_eq_m64(r, expected);
    }
}

View file

@ -327,7 +327,7 @@ pub use self::test::*;
#[doc(hidden)]
#[allow(non_camel_case_types)]
trait m128iExt: Sized {
pub(crate) trait m128iExt: Sized {
fn as_m128i(self) -> __m128i;
#[inline]
@ -380,7 +380,7 @@ impl m128iExt for __m128i {
#[doc(hidden)]
#[allow(non_camel_case_types)]
trait m256iExt: Sized {
pub(crate) trait m256iExt: Sized {
fn as_m256i(self) -> __m256i;
#[inline]
@ -431,21 +431,69 @@ impl m256iExt for __m256i {
}
}
mod i386;
pub use self::i386::*;
// x86 w/o sse2
mod i586;
pub use self::i586::*;
mod eflags;
pub use self::eflags::*;
// `i686` is `i586 + sse2`.
//
// This module is not available for `i586` targets,
// but available for all `i686` targets by default
mod i686;
pub use self::i686::*;
#[cfg(dont_compile_me)] // TODO: need to upstream `fxsr` target feature
mod fxsr;
#[cfg(dont_compile_me)] // TODO: need to upstream `fxsr` target feature
pub use self::fxsr::*;
#[cfg(target_arch = "x86_64")]
mod x86_64;
#[cfg(target_arch = "x86_64")]
pub use self::x86_64::*;
mod bswap;
pub use self::bswap::*;
mod rdtsc;
pub use self::rdtsc::*;
mod cpuid;
pub use self::cpuid::*;
mod xsave;
pub use self::xsave::*;
mod sse;
pub use self::sse::*;
mod sse2;
pub use self::sse2::*;
mod sse3;
pub use self::sse3::*;
mod ssse3;
pub use self::ssse3::*;
mod sse41;
pub use self::sse41::*;
mod sse42;
pub use self::sse42::*;
mod avx;
pub use self::avx::*;
mod avx2;
pub use self::avx2::*;
mod abm;
pub use self::abm::*;
mod bmi;
pub use self::bmi::*;
mod bmi2;
pub use self::bmi2::*;
#[cfg(not(feature = "intel_sde"))]
mod sse4a;
#[cfg(not(feature = "intel_sde"))]
pub use self::sse4a::*;
#[cfg(not(feature = "intel_sde"))]
mod tbm;
#[cfg(not(feature = "intel_sde"))]
pub use self::tbm::*;
mod mmx;
pub use self::mmx::*;
mod pclmulqdq;
pub use self::pclmulqdq::*;
mod aes;
pub use self::aes::*;
mod rdrand;
pub use self::rdrand::*;

View file

@ -54,7 +54,7 @@ extern "C" {
#[cfg(test)]
mod tests {
use stdsimd_test::simd_test;
use coresimd::x86::i386::rdtsc;
use coresimd::x86::rdtsc;
#[simd_test = "sse2"]
unsafe fn _rdtsc() {

View file

@ -1680,6 +1680,38 @@ extern "C" {
fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
#[link_name = "llvm.x86.mmx.movnt.dq"]
fn movntdq(a: *mut __m64, b: __m64);
#[link_name = "llvm.x86.sse.cvtpi2ps"]
fn cvtpi2ps(a: __m128, b: __m64) -> __m128;
#[link_name = "llvm.x86.mmx.maskmovq"]
fn maskmovq(a: __m64, mask: __m64, mem_addr: *mut i8);
#[link_name = "llvm.x86.mmx.pextr.w"]
fn pextrw(a: __m64, imm8: i32) -> i32;
#[link_name = "llvm.x86.mmx.pinsr.w"]
fn pinsrw(a: __m64, d: i32, imm8: i32) -> __m64;
#[link_name = "llvm.x86.mmx.pmovmskb"]
fn pmovmskb(a: __m64) -> i32;
#[link_name = "llvm.x86.sse.pshuf.w"]
fn pshufw(a: __m64, imm8: i8) -> __m64;
#[link_name = "llvm.x86.mmx.pmaxs.w"]
fn pmaxsw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pmaxu.b"]
fn pmaxub(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pmins.w"]
fn pminsw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pminu.b"]
fn pminub(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pmulhu.w"]
fn pmulhuw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pavg.b"]
fn pavgb(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pavg.w"]
fn pavgw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.psad.bw"]
fn psadbw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.sse.cvtps2pi"]
fn cvtps2pi(a: __m128) -> __m64;
#[link_name = "llvm.x86.sse.cvttps2pi"]
fn cvttps2pi(a: __m128) -> __m64;
}
/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
@ -1702,6 +1734,432 @@ pub unsafe fn _mm_stream_pi(mem_addr: *mut __m64, a: __m64) {
movntdq(mem_addr, a)
}
/// Compares the packed 16-bit signed integers of `a` and `b` writing the
/// greatest value into the result.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmaxsw))]
pub unsafe fn _mm_max_pi16(a: __m64, b: __m64) -> __m64 {
    // Thin wrapper over the `llvm.x86.mmx.pmaxs.w` intrinsic.
    pmaxsw(a, b)
}
/// Compares the packed 16-bit signed integers of `a` and `b` writing the
/// greatest value into the result.
///
/// Alias of `_mm_max_pi16` using the classic MMX intrinsic name.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmaxsw))]
pub unsafe fn _m_pmaxsw(a: __m64, b: __m64) -> __m64 {
    _mm_max_pi16(a, b)
}
/// Compares the packed 8-bit unsigned integers of `a` and `b` writing the
/// greatest value into the result.
///
/// (The comparison is unsigned: this wraps `llvm.x86.mmx.pmaxu.b`.)
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmaxub))]
pub unsafe fn _mm_max_pu8(a: __m64, b: __m64) -> __m64 {
    pmaxub(a, b)
}
/// Compares the packed 8-bit unsigned integers of `a` and `b` writing the
/// greatest value into the result.
///
/// Alias of `_mm_max_pu8` using the classic MMX intrinsic name.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmaxub))]
pub unsafe fn _m_pmaxub(a: __m64, b: __m64) -> __m64 {
    _mm_max_pu8(a, b)
}
/// Compares the packed 16-bit signed integers of `a` and `b` writing the
/// smallest value into the result.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pminsw))]
pub unsafe fn _mm_min_pi16(a: __m64, b: __m64) -> __m64 {
    pminsw(a, b)
}
/// Compares the packed 16-bit signed integers of `a` and `b` writing the
/// smallest value into the result.
///
/// Alias of `_mm_min_pi16` using the classic MMX intrinsic name.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pminsw))]
pub unsafe fn _m_pminsw(a: __m64, b: __m64) -> __m64 {
    _mm_min_pi16(a, b)
}
/// Compares the packed 8-bit unsigned integers of `a` and `b` writing the
/// smallest value into the result.
///
/// (The comparison is unsigned: this wraps `llvm.x86.mmx.pminu.b`.)
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pminub))]
pub unsafe fn _mm_min_pu8(a: __m64, b: __m64) -> __m64 {
    pminub(a, b)
}
/// Compares the packed 8-bit unsigned integers of `a` and `b` writing the
/// smallest value into the result.
///
/// Alias of `_mm_min_pu8` using the classic MMX intrinsic name.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pminub))]
pub unsafe fn _m_pminub(a: __m64, b: __m64) -> __m64 {
    _mm_min_pu8(a, b)
}
/// Multiplies packed 16-bit unsigned integer values and writes the
/// high-order 16 bits of each 32-bit product to the corresponding bits in
/// the destination.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmulhuw))]
pub unsafe fn _mm_mulhi_pu16(a: __m64, b: __m64) -> __m64 {
    pmulhuw(a, b)
}
/// Multiplies packed 16-bit unsigned integer values and writes the
/// high-order 16 bits of each 32-bit product to the corresponding bits in
/// the destination.
///
/// Alias of `_mm_mulhi_pu16` using the classic MMX intrinsic name.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmulhuw))]
pub unsafe fn _m_pmulhuw(a: __m64, b: __m64) -> __m64 {
    _mm_mulhi_pu16(a, b)
}
/// Computes the rounded averages of the packed unsigned 8-bit integer
/// values and writes the averages to the corresponding bits in the
/// destination.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pavgb))]
pub unsafe fn _mm_avg_pu8(a: __m64, b: __m64) -> __m64 {
    pavgb(a, b)
}
/// Computes the rounded averages of the packed unsigned 8-bit integer
/// values and writes the averages to the corresponding bits in the
/// destination.
///
/// Alias of `_mm_avg_pu8` using the classic MMX intrinsic name.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pavgb))]
pub unsafe fn _m_pavgb(a: __m64, b: __m64) -> __m64 {
    _mm_avg_pu8(a, b)
}
/// Computes the rounded averages of the packed unsigned 16-bit integer
/// values and writes the averages to the corresponding bits in the
/// destination.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pavgw))]
pub unsafe fn _mm_avg_pu16(a: __m64, b: __m64) -> __m64 {
    pavgw(a, b)
}
/// Computes the rounded averages of the packed unsigned 16-bit integer
/// values and writes the averages to the corresponding bits in the
/// destination.
///
/// Alias of `_mm_avg_pu16` using the classic MMX intrinsic name.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pavgw))]
pub unsafe fn _m_pavgw(a: __m64, b: __m64) -> __m64 {
    _mm_avg_pu16(a, b)
}
/// Subtracts the corresponding 8-bit unsigned integer values of the two
/// 64-bit vector operands and computes the absolute value of each
/// difference. The sum of the 8 absolute differences is written to the
/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(psadbw))]
pub unsafe fn _mm_sad_pu8(a: __m64, b: __m64) -> __m64 {
    psadbw(a, b)
}
/// Subtracts the corresponding 8-bit unsigned integer values of the two
/// 64-bit vector operands and computes the absolute value of each
/// difference. The sum of the 8 absolute differences is written to the
/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
///
/// Alias of `_mm_sad_pu8` using the classic MMX intrinsic name.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(psadbw))]
pub unsafe fn _m_psadbw(a: __m64, b: __m64) -> __m64 {
    _mm_sad_pu8(a, b)
}
/// Converts two elements of a 64-bit vector of [2 x i32] into two
/// floating point values and writes them to the lower 64-bits of the
/// destination. The remaining higher order elements of the destination are
/// copied from the corresponding elements in the first operand.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpi32_ps(a: __m128, b: __m64) -> __m128 {
    // Thin wrapper over the `llvm.x86.sse.cvtpi2ps` intrinsic.
    cvtpi2ps(a, b)
}
/// Converts two elements of a 64-bit vector of [2 x i32] into two
/// floating point values and writes them to the lower 64-bits of the
/// destination. The remaining higher order elements of the destination are
/// copied from the corresponding elements in the first operand.
///
/// Alias of `_mm_cvtpi32_ps` using the older intrinsic name.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvt_pi2ps(a: __m128, b: __m64) -> __m128 {
    _mm_cvtpi32_ps(a, b)
}
/// Converts the lower 4 signed 8-bit values of `a` into a 128-bit vector of
/// 4 `f32`s.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpi8_ps(a: __m64) -> __m128 {
    let b = _mm_setzero_si64();
    // `b` becomes the sign mask of `a` (all-ones where a lane is negative),
    // so interleaving sign-extends each byte to 16 bits.
    let b = _mm_cmpgt_pi8(b, a);
    let b = _mm_unpacklo_pi8(a, b);
    _mm_cvtpi16_ps(b)
}
/// Converts the lower 4 unsigned 8-bit values of `a` into a 128-bit vector
/// of 4 `f32`s.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpu8_ps(a: __m64) -> __m128 {
    let b = _mm_setzero_si64();
    // Interleaving with zero zero-extends each byte to 16 bits (unsigned).
    let b = _mm_unpacklo_pi8(a, b);
    _mm_cvtpi16_ps(b)
}
/// Converts a 64-bit vector of `i16`s into a 128-bit vector of 4 `f32`s.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpi16_ps(a: __m64) -> __m128 {
    let b = _mm_setzero_si64();
    // `b` becomes the sign mask of `a`, so interleaving sign-extends each
    // 16-bit lane to 32 bits.
    let b = _mm_cmpgt_pi16(b, a);
    // Convert the high pair first, move it to the upper two lanes, then
    // convert the low pair into the lower two lanes.
    let c = _mm_unpackhi_pi16(a, b);
    let r = _mm_setzero_ps();
    let r = cvtpi2ps(r, c);
    let r = _mm_movelh_ps(r, r);
    let c = _mm_unpacklo_pi16(a, b);
    cvtpi2ps(r, c)
}
/// Converts a 64-bit vector of unsigned 16-bit values into a 128-bit vector
/// of 4 `f32`s.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpu16_ps(a: __m64) -> __m128 {
    let b = _mm_setzero_si64();
    // Interleaving with zero zero-extends each 16-bit lane to 32 bits
    // (unsigned), unlike the sign-extension in `_mm_cvtpi16_ps`.
    let c = _mm_unpackhi_pi16(a, b);
    let r = _mm_setzero_ps();
    let r = cvtpi2ps(r, c);
    let r = _mm_movelh_ps(r, r);
    let c = _mm_unpacklo_pi16(a, b);
    cvtpi2ps(r, c)
}
/// Converts the two 32-bit signed integer values from each 64-bit vector
/// operand of [2 x i32] into a 128-bit vector of [4 x float].
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpi32x2_ps(a: __m64, b: __m64) -> __m128 {
    // Result lane order is `a.0, a.1, b.0, b.1`: `b` is converted first and
    // duplicated into the upper lanes, then `a` overwrites the lower lanes.
    let c = _mm_setzero_ps();
    let c = _mm_cvtpi32_ps(c, b);
    let c = _mm_movelh_ps(c, c);
    _mm_cvtpi32_ps(c, a)
}
/// Conditionally copies the values from each 8-bit element in the first
/// 64-bit integer vector operand to the specified memory location, as
/// specified by the most significant bit in the corresponding element in the
/// second 64-bit integer vector operand.
///
/// To minimize caching, the data is flagged as non-temporal
/// (unlikely to be used again soon).
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(maskmovq))]
pub unsafe fn _mm_maskmove_si64(a: __m64, mask: __m64, mem_addr: *mut i8) {
    // `mem_addr` must be valid for an 8-byte write; only the bytes whose
    // corresponding mask byte has its MSB set are actually stored.
    maskmovq(a, mask, mem_addr)
}
/// Conditionally copies the values from each 8-bit element in the first
/// 64-bit integer vector operand to the specified memory location, as
/// specified by the most significant bit in the corresponding element in the
/// second 64-bit integer vector operand.
///
/// To minimize caching, the data is flagged as non-temporal
/// (unlikely to be used again soon).
///
/// Alias of [`_mm_maskmove_si64`].
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(maskmovq))]
pub unsafe fn _m_maskmovq(a: __m64, mask: __m64, mem_addr: *mut i8) {
    _mm_maskmove_si64(a, mask, mem_addr)
}
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
/// returns it, as specified by the immediate integer operand.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pextrw, imm2 = 0))]
#[rustc_args_required_const(1)]
pub unsafe fn _mm_extract_pi16(a: __m64, imm2: i32) -> i32 {
    // `imm2` selects one of the four 16-bit lanes and must be a
    // compile-time constant (`rustc_args_required_const`); the macro lets
    // the intrinsic receive a literal immediate for each possible value.
    macro_rules! call {
        ($imm2:expr) => { pextrw(a, $imm2) as i32 }
    }
    constify_imm2!(imm2, call)
}
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
/// returns it, as specified by the immediate integer operand.
///
/// Same operation as [`_mm_extract_pi16`].
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pextrw, imm2 = 0))]
#[rustc_args_required_const(1)]
pub unsafe fn _m_pextrw(a: __m64, imm2: i32) -> i32 {
    macro_rules! call {
        ($imm2:expr) => { pextrw(a, $imm2) as i32 }
    }
    constify_imm2!(imm2, call)
}
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
/// specified by the immediate operand `imm2`.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pinsrw, imm2 = 0))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm_insert_pi16(a: __m64, d: i32, imm2: i32) -> __m64 {
    // `imm2` is the destination lane index (a compile-time constant); only
    // the low 16 bits of `d` are inserted.
    macro_rules! call {
        ($imm2:expr) => { pinsrw(a, d, $imm2) }
    }
    constify_imm2!(imm2, call)
}
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
/// specified by the immediate operand `imm2`.
///
/// Same operation as [`_mm_insert_pi16`].
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pinsrw, imm2 = 0))]
#[rustc_args_required_const(2)]
pub unsafe fn _m_pinsrw(a: __m64, d: i32, imm2: i32) -> __m64 {
    macro_rules! call {
        ($imm2:expr) => { pinsrw(a, d, $imm2) }
    }
    constify_imm2!(imm2, call)
}
/// Takes the most significant bit from each 8-bit element in a 64-bit
/// integer vector to create a 16-bit mask value. Zero-extends the value to
/// 32-bit integer and writes it to the destination.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmovmskb))]
pub unsafe fn _mm_movemask_pi8(a: __m64) -> i32 {
    // Delegates to the `pmovmskb` intrinsic shim declared elsewhere in this
    // file.
    pmovmskb(a)
}
/// Takes the most significant bit from each 8-bit element in a 64-bit
/// integer vector to create a 16-bit mask value. Zero-extends the value to
/// 32-bit integer and writes it to the destination.
///
/// Alias of [`_mm_movemask_pi8`].
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pmovmskb))]
pub unsafe fn _m_pmovmskb(a: __m64) -> i32 {
    _mm_movemask_pi8(a)
}
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
/// destination, as specified by the immediate value operand.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pshufw, imm8 = 0))]
#[rustc_args_required_const(1)]
pub unsafe fn _mm_shuffle_pi16(a: __m64, imm8: i32) -> __m64 {
    // `imm8` encodes four 2-bit source-lane indices and must be a
    // compile-time constant (`rustc_args_required_const`).
    macro_rules! call {
        ($imm8:expr) => { pshufw(a, $imm8) }
    }
    constify_imm8!(imm8, call)
}
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
/// destination, as specified by the immediate value operand.
///
/// Same operation as [`_mm_shuffle_pi16`].
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(pshufw, imm8 = 0))]
#[rustc_args_required_const(1)]
pub unsafe fn _m_pshufw(a: __m64, imm8: i32) -> __m64 {
    macro_rules! call {
        ($imm8:expr) => { pshufw(a, $imm8) }
    }
    constify_imm8!(imm8, call)
}
/// Convert the two lower packed single-precision (32-bit) floating-point
/// elements in `a` to packed 32-bit integers with truncation.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvttps2pi))]
pub unsafe fn _mm_cvttps_pi32(a: __m128) -> __m64 {
    // Delegates to the `cvttps2pi` intrinsic shim declared elsewhere in
    // this file.
    cvttps2pi(a)
}
/// Convert the two lower packed single-precision (32-bit) floating-point
/// elements in `a` to packed 32-bit integers with truncation.
///
/// Alias of [`_mm_cvttps_pi32`].
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvttps2pi))]
pub unsafe fn _mm_cvtt_ps2pi(a: __m128) -> __m64 {
    _mm_cvttps_pi32(a)
}
/// Convert the two lower packed single-precision (32-bit) floating-point
/// elements in `a` to packed 32-bit integers.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtps2pi))]
pub unsafe fn _mm_cvtps_pi32(a: __m128) -> __m64 {
    // Delegates to the `cvtps2pi` intrinsic shim declared elsewhere in this
    // file (rounding variant; see `_mm_cvttps_pi32` for truncation).
    cvtps2pi(a)
}
/// Convert the two lower packed single-precision (32-bit) floating-point
/// elements in `a` to packed 32-bit integers.
///
/// Alias of [`_mm_cvtps_pi32`].
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtps2pi))]
pub unsafe fn _mm_cvt_ps2pi(a: __m128) -> __m64 {
    _mm_cvtps_pi32(a)
}
/// Convert packed single-precision (32-bit) floating-point elements in `a` to
/// packed 16-bit integers.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtps2pi))]
pub unsafe fn _mm_cvtps_pi16(a: __m128) -> __m64 {
    // Convert the low two lanes, then the high two (moved down via
    // `movehl`), and pack the four i32 results into four i16 lanes.
    let b = _mm_cvtps_pi32(a);
    let a = _mm_movehl_ps(a, a);
    let c = _mm_cvtps_pi32(a);
    _mm_packs_pi32(b, c)
}
/// Convert packed single-precision (32-bit) floating-point elements in `a` to
/// packed 8-bit integers, and returns them in the lower 4 elements of the
/// result.
#[inline]
#[target_feature(enable = "sse,mmx")]
#[cfg_attr(test, assert_instr(cvtps2pi))]
pub unsafe fn _mm_cvtps_pi8(a: __m128) -> __m64 {
    // Go through i16 first, then pack against zeros so the upper 4 result
    // bytes are 0.
    let b = _mm_cvtps_pi16(a);
    let c = _mm_setzero_si64();
    _mm_packs_pi16(b, c)
}
#[cfg(test)]
mod tests {
use std::mem::transmute;
@ -3121,4 +3579,240 @@ mod tests {
_mm_stream_pi(&mut *mem as *mut _ as *mut _, a);
assert_eq_m64(a, *mem);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_max_pi16() {
let a = _mm_setr_pi16(-1, 6, -3, 8);
let b = _mm_setr_pi16(5, -2, 7, -4);
let r = _mm_setr_pi16(5, 6, 7, 8);
assert_eq_m64(r, _mm_max_pi16(a, b));
assert_eq_m64(r, _m_pmaxsw(a, b));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_max_pu8() {
let a = _mm_setr_pi8(2, 6, 3, 8, 2, 6, 3, 8);
let b = _mm_setr_pi8(5, 2, 7, 4, 5, 2, 7, 4);
let r = _mm_setr_pi8(5, 6, 7, 8, 5, 6, 7, 8);
assert_eq_m64(r, _mm_max_pu8(a, b));
assert_eq_m64(r, _m_pmaxub(a, b));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_min_pi16() {
let a = _mm_setr_pi16(-1, 6, -3, 8);
let b = _mm_setr_pi16(5, -2, 7, -4);
let r = _mm_setr_pi16(-1, -2, -3, -4);
assert_eq_m64(r, _mm_min_pi16(a, b));
assert_eq_m64(r, _m_pminsw(a, b));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_min_pu8() {
let a = _mm_setr_pi8(2, 6, 3, 8, 2, 6, 3, 8);
let b = _mm_setr_pi8(5, 2, 7, 4, 5, 2, 7, 4);
let r = _mm_setr_pi8(2, 2, 3, 4, 2, 2, 3, 4);
assert_eq_m64(r, _mm_min_pu8(a, b));
assert_eq_m64(r, _m_pminub(a, b));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_mulhi_pu16() {
let (a, b) = (_mm_set1_pi16(1000), _mm_set1_pi16(1001));
let r = _mm_mulhi_pu16(a, b);
assert_eq_m64(r, _mm_set1_pi16(15));
}
#[simd_test = "sse,mmx"]
unsafe fn test_m_pmulhuw() {
let (a, b) = (_mm_set1_pi16(1000), _mm_set1_pi16(1001));
let r = _m_pmulhuw(a, b);
assert_eq_m64(r, _mm_set1_pi16(15));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_avg_pu8() {
let (a, b) = (_mm_set1_pi8(3), _mm_set1_pi8(9));
let r = _mm_avg_pu8(a, b);
assert_eq_m64(r, _mm_set1_pi8(6));
let r = _m_pavgb(a, b);
assert_eq_m64(r, _mm_set1_pi8(6));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_avg_pu16() {
let (a, b) = (_mm_set1_pi16(3), _mm_set1_pi16(9));
let r = _mm_avg_pu16(a, b);
assert_eq_m64(r, _mm_set1_pi16(6));
let r = _m_pavgw(a, b);
assert_eq_m64(r, _mm_set1_pi16(6));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_sad_pu8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = _mm_setr_pi8(
255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
1, 2, 3, 4,
);
let b = _mm_setr_pi8(0, 0, 0, 0, 2, 1, 2, 1);
let r = _mm_sad_pu8(a, b);
assert_eq_m64(r, _mm_setr_pi16(1020, 0, 0, 0));
let r = _m_psadbw(a, b);
assert_eq_m64(r, _mm_setr_pi16(1020, 0, 0, 0));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtpi32_ps() {
let a = _mm_setr_ps(0., 0., 3., 4.);
let b = _mm_setr_pi32(1, 2);
let expected = _mm_setr_ps(1., 2., 3., 4.);
let r = _mm_cvtpi32_ps(a, b);
assert_eq_m128(r, expected);
let r = _mm_cvt_pi2ps(a, b);
assert_eq_m128(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtpi16_ps() {
let a = _mm_setr_pi16(1, 2, 3, 4);
let expected = _mm_setr_ps(1., 2., 3., 4.);
let r = _mm_cvtpi16_ps(a);
assert_eq_m128(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtpu16_ps() {
let a = _mm_setr_pi16(1, 2, 3, 4);
let expected = _mm_setr_ps(1., 2., 3., 4.);
let r = _mm_cvtpu16_ps(a);
assert_eq_m128(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtpi8_ps() {
let a = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
let expected = _mm_setr_ps(1., 2., 3., 4.);
let r = _mm_cvtpi8_ps(a);
assert_eq_m128(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtpu8_ps() {
let a = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
let expected = _mm_setr_ps(1., 2., 3., 4.);
let r = _mm_cvtpu8_ps(a);
assert_eq_m128(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtpi32x2_ps() {
let a = _mm_setr_pi32(1, 2);
let b = _mm_setr_pi32(3, 4);
let expected = _mm_setr_ps(1., 2., 3., 4.);
let r = _mm_cvtpi32x2_ps(a, b);
assert_eq_m128(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_maskmove_si64() {
let a = _mm_set1_pi8(9);
let mask = _mm_setr_pi8(0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0);
let mut r = _mm_set1_pi8(0);
_mm_maskmove_si64(a, mask, &mut r as *mut _ as *mut i8);
let e = _mm_setr_pi8(0, 0, 9, 0, 0, 0, 0, 0);
assert_eq_m64(r, e);
let mut r = _mm_set1_pi8(0);
_m_maskmovq(a, mask, &mut r as *mut _ as *mut i8);
assert_eq_m64(r, e);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_extract_pi16() {
let a = _mm_setr_pi16(1, 2, 3, 4);
let r = _mm_extract_pi16(a, 0);
assert_eq!(r, 1);
let r = _mm_extract_pi16(a, 1);
assert_eq!(r, 2);
let r = _m_pextrw(a, 1);
assert_eq!(r, 2);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_insert_pi16() {
let a = _mm_setr_pi16(1, 2, 3, 4);
let r = _mm_insert_pi16(a, 0, 0b0);
let expected = _mm_setr_pi16(0, 2, 3, 4);
assert_eq_m64(r, expected);
let r = _mm_insert_pi16(a, 0, 0b10);
let expected = _mm_setr_pi16(1, 2, 0, 4);
assert_eq_m64(r, expected);
let r = _m_pinsrw(a, 0, 0b10);
assert_eq_m64(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_movemask_pi8() {
let a =
_mm_setr_pi16(0b1000_0000, 0b0100_0000, 0b1000_0000, 0b0100_0000);
let r = _mm_movemask_pi8(a);
assert_eq!(r, 0b10001);
let r = _m_pmovmskb(a);
assert_eq!(r, 0b10001);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_shuffle_pi16() {
let a = _mm_setr_pi16(1, 2, 3, 4);
let r = _mm_shuffle_pi16(a, 0b00_01_01_11);
let expected = _mm_setr_pi16(4, 2, 2, 1);
assert_eq_m64(r, expected);
let r = _m_pshufw(a, 0b00_01_01_11);
assert_eq_m64(r, expected);
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtps_pi32() {
let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
let r = _mm_setr_pi32(1, 2);
assert_eq_m64(r, _mm_cvtps_pi32(a));
assert_eq_m64(r, _mm_cvt_ps2pi(a));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvttps_pi32() {
let a = _mm_setr_ps(7.0, 2.0, 3.0, 4.0);
let r = _mm_setr_pi32(7, 2);
assert_eq_m64(r, _mm_cvttps_pi32(a));
assert_eq_m64(r, _mm_cvtt_ps2pi(a));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtps_pi16() {
let a = _mm_setr_ps(7.0, 2.0, 3.0, 4.0);
let r = _mm_setr_pi16(7, 2, 3, 4);
assert_eq_m64(r, _mm_cvtps_pi16(a));
}
#[simd_test = "sse,mmx"]
unsafe fn test_mm_cvtps_pi8() {
let a = _mm_setr_ps(7.0, 2.0, 3.0, 4.0);
let r = _mm_setr_pi8(7, 2, 3, 4, 0, 0, 0, 0);
assert_eq_m64(r, _mm_cvtps_pi8(a));
}
}

View file

@ -2213,6 +2213,113 @@ pub unsafe fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
simd_shuffle2(a, b, [0, 2])
}
/// Adds two signed or unsigned 64-bit integer values, returning the
/// lower 64 bits of the sum.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(paddq))]
pub unsafe fn _mm_add_si64(a: __m64, b: __m64) -> __m64 {
    // Delegates to the `llvm.x86.mmx.padd.q` shim (see the `extern` block
    // below).
    paddq(a, b)
}
/// Multiplies 32-bit unsigned integer values contained in the lower bits
/// of the two 64-bit integer vectors and returns the 64-bit unsigned
/// product.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(pmuludq))]
pub unsafe fn _mm_mul_su32(a: __m64, b: __m64) -> __m64 {
    // `pmuludq2` is the MMX `llvm.x86.mmx.pmulu.dq` shim (see the `extern`
    // block below); the "2" suffix avoids clashing with the SSE2 shim.
    pmuludq2(a, b)
}
/// Subtracts signed or unsigned 64-bit integer values and writes the
/// difference to the corresponding bits in the destination.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(psubq))]
pub unsafe fn _mm_sub_si64(a: __m64, b: __m64) -> __m64 {
    // Computes `a - b` via the `llvm.x86.mmx.psub.q` shim (see the `extern`
    // block below).
    psubq(a, b)
}
/// Converts the two signed 32-bit integer elements of a 64-bit vector of
/// [2 x i32] into two double-precision floating-point values, returned in a
/// 128-bit vector of [2 x double].
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2pd))]
pub unsafe fn _mm_cvtpi32_pd(a: __m64) -> __m128d {
    // Delegates to the `llvm.x86.sse.cvtpi2pd` shim (see the `extern` block
    // below).
    cvtpi2pd(a)
}
/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
/// the specified 64-bit integer values.
///
/// `e1` becomes the upper 64 bits of the result and `e0` the lower 64 bits.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// no particular instruction to test
pub unsafe fn _mm_set_epi64(e1: __m64, e0: __m64) -> __m128i {
    _mm_set_epi64x(mem::transmute(e1), mem::transmute(e0))
}
/// Initializes both values in a 128-bit vector of [2 x i64] with the
/// specified 64-bit value.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// no particular instruction to test
pub unsafe fn _mm_set1_epi64(a: __m64) -> __m128i {
    // Broadcast: both 64-bit halves of the result hold `a`.
    _mm_set_epi64x(mem::transmute(a), mem::transmute(a))
}
/// Constructs a 128-bit integer vector, initialized in reverse order
/// with the specified 64-bit integral values.
///
/// Unlike `_mm_set_epi64`, `e1` is the *lower* 64 bits here and `e0` the
/// upper — note the swapped argument order in the forwarded call.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// no particular instruction to test
pub unsafe fn _mm_setr_epi64(e1: __m64, e0: __m64) -> __m128i {
    _mm_set_epi64x(mem::transmute(e0), mem::transmute(e1))
}
/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
/// integer.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// #[cfg_attr(test, assert_instr(movdq2q))] // FIXME: llvm codegens wrong
// instr?
pub unsafe fn _mm_movepi64_pi64(a: __m128i) -> __m64 {
    // Extracts lane 0 of the i64x2 view and reinterprets it as `__m64`.
    mem::transmute(simd_extract::<_, i64>(a.as_i64x2(), 0))
}
/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
/// upper bits.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// #[cfg_attr(test, assert_instr(movq2dq))] // FIXME: llvm codegens wrong
// instr?
pub unsafe fn _mm_movpi64_epi64(a: __m64) -> __m128i {
    // High half 0, low half `a`.
    _mm_set_epi64x(0, mem::transmute(a))
}
/// Converts the two double-precision floating-point elements of a
/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
/// returned in a 64-bit vector of [2 x i32].
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(cvtpd2pi))]
pub unsafe fn _mm_cvtpd_pi32(a: __m128d) -> __m64 {
    // Delegates to the `llvm.x86.sse.cvtpd2pi` shim (see the `extern` block
    // below); see `_mm_cvttpd_pi32` for the truncating variant.
    cvtpd2pi(a)
}
/// Converts the two double-precision floating-point elements of a
/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
/// returned in a 64-bit vector of [2 x i32].
/// If the result of either conversion is inexact, the result is truncated
/// (rounded towards zero) regardless of the current MXCSR setting.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(cvttpd2pi))]
pub unsafe fn _mm_cvttpd_pi32(a: __m128d) -> __m64 {
    // Delegates to the `llvm.x86.sse.cvttpd2pi` shim (see the `extern`
    // block below).
    cvttpd2pi(a)
}
#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.sse2.pause"]
@ -2371,11 +2478,23 @@ extern "C" {
fn storeudq(mem_addr: *mut i8, a: __m128i);
#[link_name = "llvm.x86.sse2.storeu.pd"]
fn storeupd(mem_addr: *mut i8, a: __m128d);
#[link_name = "llvm.x86.mmx.padd.q"]
fn paddq(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pmulu.dq"]
fn pmuludq2(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.psub.q"]
fn psubq(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.sse.cvtpi2pd"]
fn cvtpi2pd(a: __m64) -> __m128d;
#[link_name = "llvm.x86.sse.cvtpd2pi"]
fn cvtpd2pi(a: __m128d) -> __m64;
#[link_name = "llvm.x86.sse.cvttpd2pi"]
fn cvttpd2pi(a: __m128d) -> __m64;
}
#[cfg(test)]
mod tests {
use std::mem::transmute;
use std::mem::{self, transmute};
use std::f64::{self, NAN};
use std::f32;
use std::i32;
@ -4452,4 +4571,89 @@ mod tests {
let r = _mm_castsi128_ps(a);
assert_eq_m128(r, expected);
}
#[simd_test = "sse2,mmx"]
unsafe fn test_mm_add_si64() {
let a = 1i64;
let b = 2i64;
let expected = 3i64;
let r = _mm_add_si64(mem::transmute(a), mem::transmute(b));
assert_eq!(mem::transmute::<__m64, i64>(r), expected);
}
#[simd_test = "sse2,mmx"]
unsafe fn test_mm_mul_su32() {
let a = _mm_setr_pi32(1, 2);
let b = _mm_setr_pi32(3, 4);
let expected = 3u64;
let r = _mm_mul_su32(a, b);
assert_eq_m64(r, mem::transmute(expected));
}
#[simd_test = "sse2,mmx"]
unsafe fn test_mm_sub_si64() {
let a = 1i64;
let b = 2i64;
let expected = -1i64;
let r = _mm_sub_si64(mem::transmute(a), mem::transmute(b));
assert_eq!(mem::transmute::<__m64, i64>(r), expected);
}
#[simd_test = "sse2,mmx"]
unsafe fn test_mm_cvtpi32_pd() {
let a = _mm_setr_pi32(1, 2);
let expected = _mm_setr_pd(1., 2.);
let r = _mm_cvtpi32_pd(a);
assert_eq_m128d(r, expected);
}
#[simd_test = "sse2,mmx"]
unsafe fn test_mm_set_epi64() {
let r = _mm_set_epi64(mem::transmute(1i64), mem::transmute(2i64));
assert_eq_m128i(r, _mm_setr_epi64x(2, 1));
}
#[simd_test = "sse2,mmx"]
unsafe fn test_mm_set1_epi64() {
let r = _mm_set1_epi64(mem::transmute(1i64));
assert_eq_m128i(r, _mm_setr_epi64x(1, 1));
}
#[simd_test = "sse2,mmx"]
unsafe fn test_mm_setr_epi64() {
let r = _mm_setr_epi64(mem::transmute(1i64), mem::transmute(2i64));
assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
}
#[simd_test = "sse2,mmx"]
unsafe fn test_mm_movepi64_pi64() {
let r = _mm_movepi64_pi64(_mm_setr_epi64x(5, 0));
assert_eq_m64(r, _mm_setr_pi8(5, 0, 0, 0, 0, 0, 0, 0));
}
#[simd_test = "sse2,mmx"]
unsafe fn test_mm_movpi64_epi64() {
let r = _mm_movpi64_epi64(_mm_setr_pi8(5, 0, 0, 0, 0, 0, 0, 0));
assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
}
#[simd_test = "sse2,mmx"]
unsafe fn test_mm_cvtpd_pi32() {
let a = _mm_setr_pd(5., 0.);
let r = _mm_cvtpd_pi32(a);
assert_eq_m64(r, _mm_setr_pi32(5, 0));
}
#[simd_test = "sse2,mmx"]
unsafe fn test_mm_cvttpd_pi32() {
use std::{f64, i32};
let a = _mm_setr_pd(5., 0.);
let r = _mm_cvttpd_pi32(a);
assert_eq_m64(r, _mm_setr_pi32(5, 0));
let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
let r = _mm_cvttpd_pi32(a);
assert_eq_m64(r, _mm_setr_pi32(i32::MIN, i32::MIN));
}
}

View file

@ -797,6 +797,125 @@ pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
mem::transmute(constify_imm3!(imm8, call))
}
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    // `ptestz` is the `llvm.x86.sse41.ptestz` shim (see the `extern` block
    // below); both operands are reinterpreted as i64x2.
    ptestz(a.as_i64x2(), mask.as_i64x2())
}
/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    // `ptestc` is the `llvm.x86.sse41.ptestc` shim (see the `extern` block
    // below).
    ptestc(a.as_i64x2(), mask.as_i64x2())
}
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    // `ptestnzc` is the `llvm.x86.sse41.ptestnzc` shim (see the `extern`
    // block below).
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// Alias of [`_mm_testz_si128`].
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testz_si128(a, mask)
}
/// Tests whether the specified bits in `a` 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
    // `_mm_cmpeq_epi32(a, a)` produces an all-ones mask, so this checks
    // every bit of `a` via the carry form of `ptest`.
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// Alias of [`_mm_testnzc_si128`].
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testnzc_si128(a, mask)
}
#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.sse41.pblendvb"]
@ -849,6 +968,12 @@ extern "C" {
fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
#[link_name = "llvm.x86.sse41.mpsadbw"]
fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
#[link_name = "llvm.x86.sse41.ptestz"]
fn ptestz(a: i64x2, mask: i64x2) -> i32;
#[link_name = "llvm.x86.sse41.ptestc"]
fn ptestc(a: i64x2, mask: i64x2) -> i32;
#[link_name = "llvm.x86.sse41.ptestnzc"]
fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}
#[cfg(test)]
@ -1476,4 +1601,102 @@ mod tests {
let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
assert_eq_m128i(r, e);
}
#[simd_test = "sse4.1"]
unsafe fn test_mm_testz_si128() {
let a = _mm_set1_epi8(1);
let mask = _mm_set1_epi8(0);
let r = _mm_testz_si128(a, mask);
assert_eq!(r, 1);
let a = _mm_set1_epi8(0b101);
let mask = _mm_set1_epi8(0b110);
let r = _mm_testz_si128(a, mask);
assert_eq!(r, 0);
let a = _mm_set1_epi8(0b011);
let mask = _mm_set1_epi8(0b100);
let r = _mm_testz_si128(a, mask);
assert_eq!(r, 1);
}
#[simd_test = "sse4.1"]
unsafe fn test_mm_testc_si128() {
let a = _mm_set1_epi8(-1);
let mask = _mm_set1_epi8(0);
let r = _mm_testc_si128(a, mask);
assert_eq!(r, 1);
let a = _mm_set1_epi8(0b101);
let mask = _mm_set1_epi8(0b110);
let r = _mm_testc_si128(a, mask);
assert_eq!(r, 0);
let a = _mm_set1_epi8(0b101);
let mask = _mm_set1_epi8(0b100);
let r = _mm_testc_si128(a, mask);
assert_eq!(r, 1);
}
#[simd_test = "sse4.1"]
unsafe fn test_mm_testnzc_si128() {
let a = _mm_set1_epi8(0);
let mask = _mm_set1_epi8(1);
let r = _mm_testnzc_si128(a, mask);
assert_eq!(r, 0);
let a = _mm_set1_epi8(-1);
let mask = _mm_set1_epi8(0);
let r = _mm_testnzc_si128(a, mask);
assert_eq!(r, 0);
let a = _mm_set1_epi8(0b101);
let mask = _mm_set1_epi8(0b110);
let r = _mm_testnzc_si128(a, mask);
assert_eq!(r, 1);
let a = _mm_set1_epi8(0b101);
let mask = _mm_set1_epi8(0b101);
let r = _mm_testnzc_si128(a, mask);
assert_eq!(r, 0);
}
#[simd_test = "sse4.1"]
unsafe fn test_mm_test_all_zeros() {
let a = _mm_set1_epi8(1);
let mask = _mm_set1_epi8(0);
let r = _mm_test_all_zeros(a, mask);
assert_eq!(r, 1);
let a = _mm_set1_epi8(0b101);
let mask = _mm_set1_epi8(0b110);
let r = _mm_test_all_zeros(a, mask);
assert_eq!(r, 0);
let a = _mm_set1_epi8(0b011);
let mask = _mm_set1_epi8(0b100);
let r = _mm_test_all_zeros(a, mask);
assert_eq!(r, 1);
}
#[simd_test = "sse4.1"]
unsafe fn test_mm_test_all_ones() {
let a = _mm_set1_epi8(-1);
let r = _mm_test_all_ones(a);
assert_eq!(r, 1);
let a = _mm_set1_epi8(0b101);
let r = _mm_test_all_ones(a);
assert_eq!(r, 0);
}
#[simd_test = "sse4.1"]
unsafe fn test_mm_test_mix_ones_zeros() {
let a = _mm_set1_epi8(0);
let mask = _mm_set1_epi8(1);
let r = _mm_test_mix_ones_zeros(a, mask);
assert_eq!(r, 0);
let a = _mm_set1_epi8(-1);
let mask = _mm_set1_epi8(0);
let r = _mm_test_mix_ones_zeros(a, mask);
assert_eq!(r, 0);
let a = _mm_set1_epi8(0b101);
let mask = _mm_set1_epi8(0b110);
let r = _mm_test_mix_ones_zeros(a, mask);
assert_eq!(r, 1);
let a = _mm_set1_epi8(0b101);
let mask = _mm_set1_epi8(0b101);
let r = _mm_test_mix_ones_zeros(a, mask);
assert_eq!(r, 0);
}
}

View file

@ -5,6 +5,7 @@
#[cfg(test)]
use stdsimd_test::assert_instr;
use coresimd::simd_llvm::*;
use coresimd::v128::*;
use coresimd::x86::*;
@ -601,6 +602,15 @@ pub unsafe fn _mm_crc32_u32(crc: u32, v: u32) -> u32 {
crc32_32_32(crc, v)
}
/// Compare packed 64-bit integers in `a` and `b` for greater-than,
/// return the results.
///
/// Each result lane is all-ones when `a > b` (signed) and all-zeros
/// otherwise.
#[inline]
#[target_feature(enable = "sse4.2")]
#[cfg_attr(test, assert_instr(pcmpgtq))]
pub unsafe fn _mm_cmpgt_epi64(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(simd_gt::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
}
#[allow(improper_ctypes)]
extern "C" {
// SSE 4.2 string and text comparison ops
@ -826,4 +836,15 @@ mod tests {
let i = _mm_crc32_u32(crc, v);
assert_eq!(i, 0xffae2ed1);
}
#[simd_test = "sse4.2"]
unsafe fn test_mm_cmpgt_epi64() {
let a = _mm_setr_epi64x(0, 0x2a);
let b = _mm_set1_epi64x(0x00);
let i = _mm_cmpgt_epi64(a, b);
assert_eq_m128i(
i,
_mm_setr_epi64x(0x00, 0xffffffffffffffffu64 as i64),
);
}
}

View file

@ -239,6 +239,169 @@ pub unsafe fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i {
mem::transmute(psignd128(a.as_i32x4(), b.as_i32x4()))
}
/// Compute the absolute value of packed 8-bit integers in `a` and
/// return the unsigned results.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pabsb))]
pub unsafe fn _mm_abs_pi8(a: __m64) -> __m64 {
    // Delegates to the `llvm.x86.ssse3.pabs.b` shim (see the `extern` block
    // below).
    pabsb(a)
}
/// Compute the absolute value of packed 16-bit integers in `a`, and return
/// the unsigned results.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pabsw))]
pub unsafe fn _mm_abs_pi16(a: __m64) -> __m64 {
    // Delegates to the `llvm.x86.ssse3.pabs.w` shim (see the `extern` block
    // below).
    pabsw(a)
}
/// Compute the absolute value of packed 32-bit integers in `a`, and return
/// the unsigned results.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pabsd))]
pub unsafe fn _mm_abs_pi32(a: __m64) -> __m64 {
    // Delegates to the `llvm.x86.ssse3.pabs.d` shim (see the `extern` block
    // below).
    pabsd(a)
}
/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in
/// the corresponding 8-bit element of `b`, and return the results
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pshufb))]
pub unsafe fn _mm_shuffle_pi8(a: __m64, b: __m64) -> __m64 {
    // Delegates to the `llvm.x86.ssse3.pshuf.b` shim (see the `extern`
    // block below).
    pshufb(a, b)
}
/// Concatenates the two 64-bit integer vector operands, and right-shifts
/// the result by the number of bytes specified in the immediate operand.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(palignr, n = 15))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm_alignr_pi8(a: __m64, b: __m64, n: i32) -> __m64 {
    // `n` is a byte shift count and must be a compile-time constant
    // (`rustc_args_required_const`); constify_imm8! forwards it as a
    // literal immediate to the `llvm.x86.mmx.palignr.b` shim.
    macro_rules! call {
        ($imm8:expr) => {
            palignrb(a, b, $imm8)
        }
    }
    constify_imm8!(n, call)
}
/// Horizontally add the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [4 x i16].
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phaddw))]
pub unsafe fn _mm_hadd_pi16(a: __m64, b: __m64) -> __m64 {
    // Delegates to the `llvm.x86.ssse3.phadd.w` shim (see the `extern`
    // block below).
    phaddw(a, b)
}
/// Horizontally add the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [2 x i32].
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phaddd))]
pub unsafe fn _mm_hadd_pi32(a: __m64, b: __m64) -> __m64 {
    // Delegates to the `llvm.x86.ssse3.phadd.d` shim (see the `extern`
    // block below).
    phaddd(a, b)
}
/// Horizontally add the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phaddsw))]
pub unsafe fn _mm_hadds_pi16(a: __m64, b: __m64) -> __m64 {
    // Saturating variant of `_mm_hadd_pi16`; delegates to the
    // `llvm.x86.ssse3.phadd.sw` shim (see the `extern` block below).
    phaddsw(a, b)
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of [4 x i16].
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phsubw))]
pub unsafe fn _mm_hsub_pi16(a: __m64, b: __m64) -> __m64 {
    // Delegates to the `llvm.x86.ssse3.phsub.w` shim (see the `extern`
    // block below).
    phsubw(a, b)
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of [2 x i32].
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phsubd))]
pub unsafe fn _mm_hsub_pi32(a: __m64, b: __m64) -> __m64 {
    // Delegates to the `llvm.x86.ssse3.phsub.d` shim (see the `extern`
    // block below).
    phsubd(a, b)
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of [4 x i16]. Positive differences greater than
/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
/// saturated to 8000h.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phsubsw))]
pub unsafe fn _mm_hsubs_pi16(a: __m64, b: __m64) -> __m64 {
    // Saturating variant of `_mm_hsub_pi16`; delegates to the
    // `llvm.x86.ssse3.phsub.sw` shim (see the `extern` block below).
    phsubsw(a, b)
}
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
/// values contained in the first source operand and packed 8-bit signed
/// integer values contained in the second source operand, adds pairs of
/// contiguous products with signed saturation, and writes the 16-bit sums to
/// the corresponding bits in the destination.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pmaddubsw))]
pub unsafe fn _mm_maddubs_pi16(a: __m64, b: __m64) -> __m64 {
    // Delegates to the `pmaddubsw` intrinsic shim declared elsewhere in
    // this file.
    pmaddubsw(a, b)
}
/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
/// products to the 18 most significant bits by right-shifting, rounds the
/// truncated value by adding 1, and writes bits [16:1] to the destination.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pmulhrsw))]
pub unsafe fn _mm_mulhrs_pi16(a: __m64, b: __m64) -> __m64 {
    // Delegates to the `pmulhrsw` intrinsic shim declared elsewhere in this
    // file.
    pmulhrsw(a, b)
}
/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit
/// integer in `b` is negative, and return the results.
/// Element in result are zeroed out when the corresponding element in `b` is
/// zero.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(psignb))]
pub unsafe fn _mm_sign_pi8(a: __m64, b: __m64) -> __m64 {
    // Delegates to the `psignb` intrinsic shim declared elsewhere in this
    // file.
    psignb(a, b)
}
/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit
/// integer in `b` is negative, and return the results.
/// Elements in the result are zeroed out when the corresponding element in
/// `b` is zero.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(psignw))]
pub unsafe fn _mm_sign_pi16(a: __m64, b: __m64) -> __m64 {
    // Thin wrapper: delegates to the `llvm.x86.ssse3.psign.w` intrinsic
    // (`psignw` instruction).
    psignw(a, b)
}
/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit
/// integer in `b` is negative, and return the results.
/// Elements in the result are zeroed out when the corresponding element in
/// `b` is zero.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(psignd))]
pub unsafe fn _mm_sign_pi32(a: __m64, b: __m64) -> __m64 {
    // Thin wrapper: delegates to the `llvm.x86.ssse3.psign.d` intrinsic
    // (`psignd` instruction).
    psignd(a, b)
}
#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.ssse3.pabs.b.128"]
@ -285,6 +448,54 @@ extern "C" {
#[link_name = "llvm.x86.ssse3.psign.d.128"]
fn psignd128(a: i32x4, b: i32x4) -> i32x4;
#[link_name = "llvm.x86.ssse3.pabs.b"]
fn pabsb(a: __m64) -> __m64;
#[link_name = "llvm.x86.ssse3.pabs.w"]
fn pabsw(a: __m64) -> __m64;
#[link_name = "llvm.x86.ssse3.pabs.d"]
fn pabsd(a: __m64) -> __m64;
#[link_name = "llvm.x86.ssse3.pshuf.b"]
fn pshufb(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.palignr.b"]
fn palignrb(a: __m64, b: __m64, n: u8) -> __m64;
#[link_name = "llvm.x86.ssse3.phadd.w"]
fn phaddw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.ssse3.phadd.d"]
fn phaddd(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.ssse3.phadd.sw"]
fn phaddsw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.ssse3.phsub.w"]
fn phsubw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.ssse3.phsub.d"]
fn phsubd(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.ssse3.phsub.sw"]
fn phsubsw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.ssse3.pmadd.ub.sw"]
fn pmaddubsw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.ssse3.pmul.hr.sw"]
fn pmulhrsw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.ssse3.psign.b"]
fn psignb(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.ssse3.psign.w"]
fn psignw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.ssse3.psign.d"]
fn psignd(a: __m64, b: __m64) -> __m64;
}
#[cfg(test)]
@ -491,4 +702,138 @@ mod tests {
let r = _mm_sign_epi32(a, b);
assert_eq_m128i(r, expected);
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_abs_pi8() {
    // |-5| == 5 in every 8-bit lane.
    let result = _mm_abs_pi8(_mm_set1_pi8(-5));
    let expected = _mm_set1_pi8(5);
    assert_eq_m64(result, expected);
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_abs_pi16() {
    // |-5| == 5 in every 16-bit lane.
    let result = _mm_abs_pi16(_mm_set1_pi16(-5));
    let expected = _mm_set1_pi16(5);
    assert_eq_m64(result, expected);
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_abs_pi32() {
    // |-5| == 5 in every 32-bit lane.
    let result = _mm_abs_pi32(_mm_set1_pi32(-5));
    let expected = _mm_set1_pi32(5);
    assert_eq_m64(result, expected);
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_shuffle_pi8() {
    // The 128 control byte has its high bit set, which zeroes that lane
    // (see the expected 0); other control bytes select source lanes mod 8.
    let src = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
    let ctrl = _mm_setr_pi8(4, 128u8 as i8, 4, 3, 24, 12, 6, 19);
    let want = _mm_setr_pi8(5, 0, 5, 4, 1, 5, 7, 4);
    assert_eq_m64(_mm_shuffle_pi8(src, ctrl), want);
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_alignr_pi8() {
let a = _mm_setr_pi32(0x89ABCDEF_u32 as i32, 0x01234567_u32 as i32);
let b = _mm_setr_pi32(0xBBAA9988_u32 as i32, 0xFFDDEECC_u32 as i32);
let r = _mm_alignr_pi8(a, b, 4);
assert_eq_m64(r, ::std::mem::transmute(0x89abcdefffddeecc_u64));
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_hadd_pi16() {
    // Pairwise sums: (1+2, 3+4) from `a`, then (4+128, 4+3) from `b`.
    let a = _mm_setr_pi16(1, 2, 3, 4);
    let b = _mm_setr_pi16(4, 128, 4, 3);
    assert_eq_m64(_mm_hadd_pi16(a, b), _mm_setr_pi16(3, 7, 132, 7));
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_hadd_pi32() {
    // Pairwise sums: 1+2 from `a`, then 4+128 from `b`.
    let a = _mm_setr_pi32(1, 2);
    let b = _mm_setr_pi32(4, 128);
    assert_eq_m64(_mm_hadd_pi32(a, b), _mm_setr_pi32(3, 132));
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_hadds_pi16() {
    // Saturating pairwise sums: 32767+1 clamps to i16::MAX and
    // -32768 + -1 clamps to i16::MIN.
    let a = _mm_setr_pi16(1, 2, 3, 4);
    let b = _mm_setr_pi16(32767, 1, -32768, -1);
    assert_eq_m64(_mm_hadds_pi16(a, b), _mm_setr_pi16(3, 7, 32767, -32768));
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_hsub_pi16() {
    // Pairwise differences: (1-2, 3-4) from `a`, then (4-128, 4-3) from `b`.
    let a = _mm_setr_pi16(1, 2, 3, 4);
    let b = _mm_setr_pi16(4, 128, 4, 3);
    assert_eq_m64(_mm_hsub_pi16(a, b), _mm_setr_pi16(-1, -1, -124, 1));
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_hsub_pi32() {
    // Pairwise differences: 1-2 from `a`, then 4-128 from `b`.
    let a = _mm_setr_pi32(1, 2);
    let b = _mm_setr_pi32(4, 128);
    assert_eq_m64(_mm_hsub_pi32(a, b), _mm_setr_pi32(-1, -124));
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_hsubs_pi16() {
    // Same inputs as the non-saturating case — none of these differences
    // actually reach the saturation bounds, so results match _mm_hsub_pi16.
    let a = _mm_setr_pi16(1, 2, 3, 4);
    let b = _mm_setr_pi16(4, 128, 4, 3);
    assert_eq_m64(_mm_hsubs_pi16(a, b), _mm_setr_pi16(-1, -1, -124, 1));
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_maddubs_pi16() {
    // Unsigned bytes from `a` times signed bytes from `b`, summed in
    // adjacent pairs: 1*4 + 2*63 = 130, 3*4 + 4*3 = 24, 5*24 + 6*12 = 192,
    // 7*6 + 8*19 = 194.
    let a = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
    let b = _mm_setr_pi8(4, 63, 4, 3, 24, 12, 6, 19);
    assert_eq_m64(_mm_maddubs_pi16(a, b), _mm_setr_pi16(130, 24, 192, 194));
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_mulhrs_pi16() {
    // Rounded high part of the products, i.e. ((a*b >> 14) + 1) >> 1 per
    // lane: small products round to 0; 2*32767 -> 2; 4*-32768 -> -4.
    let a = _mm_setr_pi16(1, 2, 3, 4);
    let b = _mm_setr_pi16(4, 32767, -1, -32768);
    assert_eq_m64(_mm_mulhrs_pi16(a, b), _mm_setr_pi16(0, 2, 0, -4));
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_sign_pi8() {
    // Per lane: b > 0 passes `a` through, b == 0 zeroes it, b < 0 negates.
    let values = _mm_setr_pi8(1, 2, 3, 4, -5, -6, 7, 8);
    let signs = _mm_setr_pi8(4, 64, 0, 3, 1, -1, -2, 1);
    let want = _mm_setr_pi8(1, 2, 0, 4, -5, 6, -7, 8);
    assert_eq_m64(_mm_sign_pi8(values, signs), want);
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_sign_pi16() {
    // Per lane: b > 0 passes `a` through, b == 0 zeroes it, b < 0 negates.
    let values = _mm_setr_pi16(-1, 2, 3, 4);
    let signs = _mm_setr_pi16(1, -1, 1, 0);
    let want = _mm_setr_pi16(-1, -2, 3, 0);
    assert_eq_m64(_mm_sign_pi16(values, signs), want);
}
#[simd_test = "ssse3,mmx"]
unsafe fn test_mm_sign_pi32() {
    // Per lane: b > 0 passes `a` through, b == 0 zeroes it.
    let values = _mm_setr_pi32(-1, 2);
    let signs = _mm_setr_pi32(1, 0);
    let want = _mm_setr_pi32(-1, 0);
    assert_eq_m64(_mm_sign_pi32(values, signs), want);
}
}

View file

@ -263,41 +263,41 @@ pub unsafe fn _tzmsk_u64(x: u64) -> u64 {
mod tests {
use stdsimd_test::simd_test;
use coresimd::x86::i586::tbm;
use coresimd::x86::*;
/*
#[simd_test = "tbm"]
unsafe fn _bextr_u32() {
assert_eq!(tbm::_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32);
unsafe fn test_bextr_u32() {
assert_eq!(_bextr_u32(0b0101_0000u32, 4, 4), 0b0000_0101u32);
}
#[simd_test = "tbm"]
unsafe fn _bextr_u64() {
assert_eq!(tbm::_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64);
unsafe fn test_bextr_u64() {
assert_eq!(_bextr_u64(0b0101_0000u64, 4, 4), 0b0000_0101u64);
}
*/
#[simd_test = "tbm"]
unsafe fn _blcfill_u32() {
assert_eq!(tbm::_blcfill_u32(0b0101_0111u32), 0b0101_0000u32);
assert_eq!(tbm::_blcfill_u32(0b1111_1111u32), 0u32);
unsafe fn test_blcfill_u32() {
assert_eq!(_blcfill_u32(0b0101_0111u32), 0b0101_0000u32);
assert_eq!(_blcfill_u32(0b1111_1111u32), 0u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
unsafe fn _blcfill_u64() {
assert_eq!(tbm::_blcfill_u64(0b0101_0111u64), 0b0101_0000u64);
assert_eq!(tbm::_blcfill_u64(0b1111_1111u64), 0u64);
unsafe fn test_blcfill_u64() {
assert_eq!(_blcfill_u64(0b0101_0111u64), 0b0101_0000u64);
assert_eq!(_blcfill_u64(0b1111_1111u64), 0u64);
}
#[simd_test = "tbm"]
unsafe fn _blci_u32() {
unsafe fn test_blci_u32() {
assert_eq!(
tbm::_blci_u32(0b0101_0000u32),
_blci_u32(0b0101_0000u32),
0b1111_1111_1111_1111_1111_1111_1111_1110u32
);
assert_eq!(
tbm::_blci_u32(0b1111_1111u32),
_blci_u32(0b1111_1111u32),
0b1111_1111_1111_1111_1111_1110_1111_1111u32
);
}
@ -305,61 +305,61 @@ mod tests {
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
#[cfg_attr(rustfmt, rustfmt_skip)]
unsafe fn _blci_u64() {
unsafe fn test_blci_u64() {
assert_eq!(
tbm::_blci_u64(0b0101_0000u64),
_blci_u64(0b0101_0000u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64
);
assert_eq!(
tbm::_blci_u64(0b1111_1111u64),
_blci_u64(0b1111_1111u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64
);
}
#[simd_test = "tbm"]
unsafe fn _blcic_u32() {
assert_eq!(tbm::_blcic_u32(0b0101_0001u32), 0b0000_0010u32);
assert_eq!(tbm::_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32);
unsafe fn test_blcic_u32() {
assert_eq!(_blcic_u32(0b0101_0001u32), 0b0000_0010u32);
assert_eq!(_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
unsafe fn _blcic_u64() {
assert_eq!(tbm::_blcic_u64(0b0101_0001u64), 0b0000_0010u64);
assert_eq!(tbm::_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64);
unsafe fn test_blcic_u64() {
assert_eq!(_blcic_u64(0b0101_0001u64), 0b0000_0010u64);
assert_eq!(_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64);
}
#[simd_test = "tbm"]
unsafe fn _blcmsk_u32() {
assert_eq!(tbm::_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32);
assert_eq!(tbm::_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32);
unsafe fn test_blcmsk_u32() {
assert_eq!(_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32);
assert_eq!(_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
unsafe fn _blcmsk_u64() {
assert_eq!(tbm::_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64);
assert_eq!(tbm::_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64);
unsafe fn test_blcmsk_u64() {
assert_eq!(_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64);
assert_eq!(_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64);
}
#[simd_test = "tbm"]
unsafe fn _blcs_u32() {
assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32);
assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32);
unsafe fn test_blcs_u32() {
assert_eq!(_blcs_u32(0b0101_0001u32), 0b0101_0011u32);
assert_eq!(_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
unsafe fn _blcs_u64() {
assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64);
assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64);
unsafe fn test_blcs_u64() {
assert_eq!(_blcs_u64(0b0101_0001u64), 0b0101_0011u64);
assert_eq!(_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64);
}
#[simd_test = "tbm"]
unsafe fn _blsfill_u32() {
assert_eq!(tbm::_blsfill_u32(0b0101_0100u32), 0b0101_0111u32);
unsafe fn test_blsfill_u32() {
assert_eq!(_blsfill_u32(0b0101_0100u32), 0b0101_0111u32);
assert_eq!(
tbm::_blsfill_u32(0u32),
_blsfill_u32(0u32),
0b1111_1111_1111_1111_1111_1111_1111_1111u32
);
}
@ -367,22 +367,22 @@ mod tests {
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
#[cfg_attr(rustfmt, rustfmt_skip)]
unsafe fn _blsfill_u64() {
assert_eq!(tbm::_blsfill_u64(0b0101_0100u64), 0b0101_0111u64);
unsafe fn test_blsfill_u64() {
assert_eq!(_blsfill_u64(0b0101_0100u64), 0b0101_0111u64);
assert_eq!(
tbm::_blsfill_u64(0u64),
_blsfill_u64(0u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
);
}
#[simd_test = "tbm"]
unsafe fn _blsic_u32() {
unsafe fn test_blsic_u32() {
assert_eq!(
tbm::_blsic_u32(0b0101_0100u32),
_blsic_u32(0b0101_0100u32),
0b1111_1111_1111_1111_1111_1111_1111_1011u32
);
assert_eq!(
tbm::_blsic_u32(0u32),
_blsic_u32(0u32),
0b1111_1111_1111_1111_1111_1111_1111_1111u32
);
}
@ -390,25 +390,25 @@ mod tests {
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
#[cfg_attr(rustfmt, rustfmt_skip)]
unsafe fn _blsic_u64() {
unsafe fn test_blsic_u64() {
assert_eq!(
tbm::_blsic_u64(0b0101_0100u64),
_blsic_u64(0b0101_0100u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64
);
assert_eq!(
tbm::_blsic_u64(0u64),
_blsic_u64(0u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
);
}
#[simd_test = "tbm"]
unsafe fn _t1mskc_u32() {
unsafe fn test_t1mskc_u32() {
assert_eq!(
tbm::_t1mskc_u32(0b0101_0111u32),
_t1mskc_u32(0b0101_0111u32),
0b1111_1111_1111_1111_1111_1111_1111_1000u32
);
assert_eq!(
tbm::_t1mskc_u32(0u32),
_t1mskc_u32(0u32),
0b1111_1111_1111_1111_1111_1111_1111_1111u32
);
}
@ -416,27 +416,27 @@ mod tests {
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
#[cfg_attr(rustfmt, rustfmt_skip)]
unsafe fn _t1mksc_u64() {
unsafe fn test_t1mksc_u64() {
assert_eq!(
tbm::_t1mskc_u64(0b0101_0111u64),
_t1mskc_u64(0b0101_0111u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64
);
assert_eq!(
tbm::_t1mskc_u64(0u64),
_t1mskc_u64(0u64),
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
);
}
#[simd_test = "tbm"]
unsafe fn _tzmsk_u32() {
assert_eq!(tbm::_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32);
assert_eq!(tbm::_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32);
unsafe fn test_tzmsk_u32() {
assert_eq!(_tzmsk_u32(0b0101_1000u32), 0b0000_0111u32);
assert_eq!(_tzmsk_u32(0b0101_1001u32), 0b0000_0000u32);
}
#[simd_test = "tbm"]
#[cfg(not(target_arch = "x86"))]
unsafe fn _tzmsk_u64() {
assert_eq!(tbm::_tzmsk_u64(0b0101_1000u64), 0b0000_0111u64);
assert_eq!(tbm::_tzmsk_u64(0b0101_1001u64), 0b0000_0000u64);
unsafe fn test_tzmsk_u64() {
assert_eq!(_tzmsk_u64(0b0101_1000u64), 0b0000_0111u64);
assert_eq!(_tzmsk_u64(0b0101_1001u64), 0b0000_0000u64);
}
}

View file

@ -103,7 +103,7 @@ pub unsafe fn get_m256(a: __m256, idx: usize) -> f32 {
A { a }.b[idx]
}
// These intrinsics doesn't exist on x86 b/c it requires a 64-bit registe,r
// These intrinsics doesn't exist on x86 b/c it requires a 64-bit register,
// which doesn't exist on x86!
#[cfg(target_arch = "x86")]
mod x86_polyfill {
@ -132,5 +132,8 @@ mod x86_polyfill {
a.a
}
}
#[cfg(target_arch = "x86")]
#[cfg(target_arch = "x86_64")]
mod x86_polyfill {
pub use coresimd::x86_64::{_mm_insert_epi64, _mm256_insert_epi64};
}
pub use self::x86_polyfill::*;

View file

@ -54,7 +54,7 @@ pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) {
/// `XFEATURE_ENABLED_MASK` for `XCR`
///
/// This intrinsic maps to `XSETBV` instruction.
const _XCR_XFEATURE_ENABLED_MASK: u32 = 0;
pub const _XCR_XFEATURE_ENABLED_MASK: u32 = 0;
/// Copy 64-bits from `val` to the extended control register (`XCR`) specified
/// by `a`.
@ -141,7 +141,7 @@ mod tests {
use std::fmt;
use std::prelude::v1::*;
use coresimd::x86::i586::xsave;
use coresimd::x86::*;
use stdsimd_test::simd_test;
#[repr(align(64))]
@ -194,23 +194,23 @@ mod tests {
let mut a = XsaveArea::new();
let mut b = XsaveArea::new();
xsave::_xsave(a.ptr(), m);
xsave::_xrstor(a.ptr(), m);
xsave::_xsave(b.ptr(), m);
_xsave(a.ptr(), m);
_xrstor(a.ptr(), m);
_xsave(b.ptr(), m);
assert_eq!(a, b);
}
*/
#[simd_test = "xsave"]
unsafe fn xgetbv_xsetbv() {
let xcr_n: u32 = xsave::_XCR_XFEATURE_ENABLED_MASK;
let xcr_n: u32 = _XCR_XFEATURE_ENABLED_MASK;
let xcr: u64 = xsave::_xgetbv(xcr_n);
let xcr: u64 = _xgetbv(xcr_n);
// FIXME: XSETBV is a privileged instruction we should only test this
// when running in privileged mode:
//
// _xsetbv(xcr_n, xcr);
let xcr_cpy: u64 = xsave::_xgetbv(xcr_n);
let xcr_cpy: u64 = _xgetbv(xcr_n);
assert_eq!(xcr, xcr_cpy);
}
@ -222,9 +222,9 @@ mod tests {
let mut a = XsaveArea::new();
let mut b = XsaveArea::new();
xsave::_xsaveopt(a.ptr(), m);
xsave::_xrstor(a.ptr(), m);
xsave::_xsaveopt(b.ptr(), m);
_xsaveopt(a.ptr(), m);
_xrstor(a.ptr(), m);
_xsaveopt(b.ptr(), m);
assert_eq!(a, b);
}
*/
@ -237,9 +237,9 @@ mod tests {
let mut a = XsaveArea::new();
let mut b = XsaveArea::new();
xsave::_xsavec(a.ptr(), m);
xsave::_xrstor(a.ptr(), m);
xsave::_xsavec(b.ptr(), m);
_xsavec(a.ptr(), m);
_xrstor(a.ptr(), m);
_xsavec(b.ptr(), m);
assert_eq!(a, b);
}
@ -251,9 +251,9 @@ mod tests {
let mut a = XsaveArea::new();
let mut b = XsaveArea::new();
xsave::_xsaves(a.ptr(), m);
xsave::_xrstors(a.ptr(), m);
xsave::_xsaves(b.ptr(), m);
_xsaves(a.ptr(), m);
_xrstors(a.ptr(), m);
_xsaves(b.ptr(), m);
assert_eq!(a, b);
}
*/

View file

@ -42,7 +42,7 @@ pub unsafe fn _popcnt64(x: i64) -> i32 {
mod tests {
use stdsimd_test::simd_test;
use coresimd::x86::*;
use coresimd::arch::x86_64::*;
#[simd_test = "lzcnt"]
unsafe fn test_lzcnt_u64() {

View file

@ -35,7 +35,7 @@ pub unsafe fn _mm256_extract_epi64(a: __m256i, imm8: i32) -> i64 {
mod tests {
use stdsimd_test::simd_test;
use coresimd::x86::*;
use coresimd::arch::x86_64::*;
#[simd_test = "avx2"]
unsafe fn test_mm256_extract_epi64() {

View file

@ -102,6 +102,7 @@ mod tests {
use stdsimd_test::simd_test;
use coresimd::x86::*;
use coresimd::x86_64::*;
#[simd_test = "bmi"]
unsafe fn test_bextr_u64() {

View file

@ -69,7 +69,7 @@ extern "C" {
mod tests {
use stdsimd_test::simd_test;
use coresimd::x86::*;
use coresimd::x86_64::*;
#[simd_test = "bmi2"]
unsafe fn test_pext_u64() {

View file

@ -66,7 +66,7 @@ mod tests {
use stdsimd_test::simd_test;
use coresimd::x86::*;
use coresimd::arch::x86_64::*;
#[simd_test = "sse"]
unsafe fn test_mm_cvtss_si64() {

View file

@ -117,7 +117,7 @@ mod tests {
use stdsimd_test::simd_test;
use coresimd::x86::*;
use coresimd::arch::x86_64::*;
#[simd_test = "sse2"]
unsafe fn test_mm_cvtsd_si64() {

View file

@ -31,7 +31,7 @@ pub unsafe fn _mm_insert_epi64(a: __m128i, i: i64, imm8: i32) -> __m128i {
#[cfg(test)]
mod tests {
use stdsimd_test::simd_test;
use coresimd::x86::*;
use coresimd::arch::x86_64::*;
#[simd_test = "sse4.1"]
unsafe fn test_mm_extract_epi64() {

View file

@ -20,7 +20,7 @@ pub unsafe fn _mm_crc32_u64(crc: u64, v: u64) -> u64 {
#[cfg(test)]
mod tests {
use coresimd::x86::*;
use coresimd::arch::x86_64::*;
use stdsimd_test::simd_test;

View file

@ -22,10 +22,10 @@ macro_rules! my_quote {
pub fn x86_functions(input: TokenStream) -> TokenStream {
let dir = Path::new(env!("CARGO_MANIFEST_DIR"));
let root = dir.parent().unwrap();
let root = root.join("../coresimd/x86");
let mut files = Vec::new();
walk(&root, &mut files);
walk(&root.join("../coresimd/x86"), &mut files);
walk(&root.join("../coresimd/x86_64"), &mut files);
assert!(files.len() > 0);
let mut functions = Vec::new();