This commit is contained in:
Andrew Gallant 2016-11-27 01:06:39 -05:00
parent c709196f7a
commit 12121fc2bb
8 changed files with 867 additions and 81 deletions

View file

@ -12,3 +12,4 @@ license = "MIT"
[profile.release]
debug = true
opt-level = 3

View file

@ -1,28 +1,24 @@
extern crate stdsimd;
use std::env;
use std::io::Write;
use stdsimd as s;
fn main() {
let arg1: f64 = env::args().nth(1).unwrap().parse().unwrap();
let arg2: f64 = env::args().nth(2).unwrap().parse().unwrap();
let arg3: f64 = env::args().nth(3).unwrap().parse().unwrap();
let arg4: f64 = env::args().nth(4).unwrap().parse().unwrap();
let arg1: u8 = env::args().nth(1).unwrap().parse().unwrap();
let arg2: u8 = env::args().nth(2).unwrap().parse().unwrap();
let arg3: u8 = env::args().nth(3).unwrap().parse().unwrap();
let arg4: u8 = env::args().nth(4).unwrap().parse().unwrap();
unsafe {
let a1 = s::_mm_load_pd(&(arg1, arg2) as *const _ as *const f64);
let b1 = s::_mm_load_pd(&(arg3, arg4) as *const _ as *const f64);
// println!("{:?}, {:?}", a, b);
let r1 = s::_mm_add_sd(a1, b1);
// println!("{:?}", r1);
let mut r2: (f64, f64) = (0.0, 0.0);
s::_mm_store_pd(&mut r2 as *mut _ as *mut f64, r1);
if r2 == (4.0, 2.0) {
::std::io::stdout().write_all(b"yes\n").unwrap();
} else {
::std::io::stdout().write_all(b"NO\n").unwrap();
}
// println!("{:?}", r2);
s::_mm_lfence();
s::_mm_pause();
let a = s::u8x16::new(
arg1, arg1, arg1, arg1, arg1, arg1, arg1, arg1,
arg2, arg2, arg2, arg2, arg2, arg2, arg2, arg2);
let b = s::u8x16::new(
arg3, arg3, arg3, arg3, arg3, arg3, arg3, arg3,
arg4, arg4, arg4, arg4, arg4, arg4, arg4, arg4);
let r = s::_mm_sad_epu8(a.as_m128i(), b.as_m128i());
println!("{:?}", s::u64x2::from(r));
}
}

View file

@ -1,36 +1,14 @@
#![allow(dead_code)]
#![feature(platform_intrinsics, repr_simd)]
#![feature(link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi)]
// pub use v128::{__m128, __m128d, __m128i};
pub use v128::*;
pub use v64::__m64;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub use x86::*;
mod simd;
mod v128;
mod v64;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod x86;
extern "platform-intrinsic" {
fn simd_eq<T, U>(x: T, y: T) -> U;
fn simd_ne<T, U>(x: T, y: T) -> U;
fn simd_lt<T, U>(x: T, y: T) -> U;
fn simd_le<T, U>(x: T, y: T) -> U;
fn simd_gt<T, U>(x: T, y: T) -> U;
fn simd_ge<T, U>(x: T, y: T) -> U;
fn simd_shuffle2<T, U>(x: T, y: T, idx: [u32; 2]) -> U;
fn simd_shuffle4<T, U>(x: T, y: T, idx: [u32; 4]) -> U;
fn simd_shuffle8<T, U>(x: T, y: T, idx: [u32; 8]) -> U;
fn simd_shuffle16<T, U>(x: T, y: T, idx: [u32; 16]) -> U;
fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T;
fn simd_extract<T, U>(x: T, idx: u32) -> U;
fn simd_cast<T, U>(x: T) -> U;
fn simd_add<T>(x: T, y: T) -> T;
fn simd_sub<T>(x: T, y: T) -> T;
fn simd_mul<T>(x: T, y: T) -> T;
fn simd_div<T>(x: T, y: T) -> T;
fn simd_shl<T>(x: T, y: T) -> T;
fn simd_shr<T>(x: T, y: T) -> T;
fn simd_and<T>(x: T, y: T) -> T;
fn simd_or<T>(x: T, y: T) -> T;
fn simd_xor<T>(x: T, y: T) -> T;
}

View file

@ -0,0 +1,28 @@
// Raw rustc "platform intrinsics": generic SIMD operations the compiler
// lowers directly to LLVM vector instructions. These are re-exported `pub`
// so the sibling `v128`/`v64`/`x86` modules can build safe-ish wrappers on
// top of them. Each is generic over the vector type; the compiler checks
// lane counts/types at monomorphization time.
extern "platform-intrinsic" {
// Lane-wise comparisons; `U` is the corresponding boolean vector type.
pub fn simd_eq<T, U>(x: T, y: T) -> U;
pub fn simd_ne<T, U>(x: T, y: T) -> U;
pub fn simd_lt<T, U>(x: T, y: T) -> U;
pub fn simd_le<T, U>(x: T, y: T) -> U;
pub fn simd_gt<T, U>(x: T, y: T) -> U;
pub fn simd_ge<T, U>(x: T, y: T) -> U;
// Shuffles: `idx` selects lanes from the concatenation of `x` and `y`;
// the array length fixes the number of result lanes.
pub fn simd_shuffle2<T, U>(x: T, y: T, idx: [u32; 2]) -> U;
pub fn simd_shuffle4<T, U>(x: T, y: T, idx: [u32; 4]) -> U;
pub fn simd_shuffle8<T, U>(x: T, y: T, idx: [u32; 8]) -> U;
pub fn simd_shuffle16<T, U>(x: T, y: T, idx: [u32; 16]) -> U;
// Single-lane access; `idx` must be in bounds (callers assert in debug).
pub fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T;
pub fn simd_extract<T, U>(x: T, idx: u32) -> U;
// Lane-wise numeric cast between vector types of equal lane count.
pub fn simd_cast<T, U>(x: T) -> U;
// Lane-wise arithmetic / bitwise operations.
pub fn simd_add<T>(x: T, y: T) -> T;
pub fn simd_sub<T>(x: T, y: T) -> T;
pub fn simd_mul<T>(x: T, y: T) -> T;
pub fn simd_div<T>(x: T, y: T) -> T;
pub fn simd_shl<T>(x: T, y: T) -> T;
pub fn simd_shr<T>(x: T, y: T) -> T;
pub fn simd_and<T>(x: T, y: T) -> T;
pub fn simd_or<T>(x: T, y: T) -> T;
pub fn simd_xor<T>(x: T, y: T) -> T;
}

160
library/stdarch/src/v128.rs Normal file
View file

@ -0,0 +1,160 @@
use std::mem::transmute;
use simd::*;
macro_rules! define_ty {
($name:ident, $($elty:ident),+) => {
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct $name($($elty),*);
}
}
macro_rules! define_ty_internal {
($name:ident, $($elty:ident),+) => {
#[repr(simd)]
#[derive(Clone, Copy, Debug, PartialEq)]
#[allow(non_camel_case_types)]
pub struct $name($($elty),*);
}
}
macro_rules! define_impl {
($name:ident, $boolname:ident, $elemty:ident, $nelems:expr,
$($elname:ident),+) => {
impl From<__m128> for $name {
#[inline]
fn from(v: __m128) -> $name { unsafe { transmute(v) } }
}
impl From<__m128i> for $name {
#[inline]
fn from(v: __m128i) -> $name { unsafe { transmute(v) } }
}
impl From<__m128d> for $name {
#[inline]
fn from(v: __m128d) -> $name { unsafe { transmute(v) } }
}
impl $name {
#[inline]
pub fn new($($elname: $elemty),*) -> $name {
$name($($elname),*)
}
#[inline]
pub fn splat(value: $elemty) -> $name {
$name($({
#[allow(non_camel_case_types, dead_code)]
struct $elname;
value
}),*)
}
#[inline]
pub fn eq(self, other: $name) -> $boolname {
unsafe { simd_eq(self, other) }
}
#[inline]
pub fn ne(self, other: $name) -> $boolname {
unsafe { simd_ne(self, other) }
}
#[inline]
pub fn lt(self, other: $name) -> $boolname {
unsafe { simd_lt(self, other) }
}
#[inline]
pub fn le(self, other: $name) -> $boolname {
unsafe { simd_le(self, other) }
}
#[inline]
pub fn gt(self, other: $name) -> $boolname {
unsafe { simd_gt(self, other) }
}
#[inline]
pub fn ge(self, other: $name) -> $boolname {
unsafe { simd_ge(self, other) }
}
#[inline]
pub unsafe fn extract(self, idx: u32) -> $elemty {
debug_assert!(idx < $nelems);
simd_extract(self, idx)
}
#[inline]
pub unsafe fn insert(self, idx: u32, val: $elemty) -> $name {
debug_assert!(idx < $nelems);
simd_insert(self, idx, val)
}
#[inline]
pub fn as_m128(self) -> __m128 { unsafe { transmute(self) } }
#[inline]
pub fn as_m128d(self) -> __m128d { unsafe { transmute(self) } }
#[inline]
pub fn as_m128i(self) -> __m128i { unsafe { transmute(self) } }
#[inline]
pub fn as_f32x4(self) -> f32x4 { unsafe { transmute(self) } }
#[inline]
pub fn as_f64x2(self) -> f64x2 { unsafe { transmute(self) } }
#[inline]
pub fn as_u8x16(self) -> u8x16 { unsafe { transmute(self) } }
}
}
}
define_ty! { __m128, f32, f32, f32, f32 }
define_ty! { __m128d, f64, f64 }
define_ty! { __m128i, u64, u64 }
define_ty_internal! { boolu64x2, u64, u64 }
define_ty_internal! { boolu32x4, u32, u32, u32, u32 }
define_ty_internal! { boolu16x8, u16, u16, u16, u16, u16, u16, u16, u16 }
define_ty_internal! {
boolu8x16, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8
}
define_ty_internal! { f64x2, f64, f64 }
define_impl! { f64x2, boolu64x2, f64, 2, x0, x1 }
define_ty_internal! { f32x4, f32, f32, f32, f32 }
// f32x4 has 4 lanes; the lane-count argument feeds the
// `debug_assert!(idx < $nelems)` in `extract`/`insert`, so passing 2 here
// (as originally written) would spuriously reject lanes 2 and 3 in debug
// builds. Every other invocation passes the true lane count.
define_impl! { f32x4, boolu32x4, f32, 4, x0, x1, x2, x3 }
define_ty_internal! { u64x2, u64, u64 }
define_impl! { u64x2, boolu64x2, u64, 2, x0, x1 }
define_ty_internal! { u32x4, u32, u32, u32, u32 }
define_impl! { u32x4, boolu32x4, u32, 4, x0, x1, x2, x3 }
define_ty_internal! { i32x4, i32, i32, i32, i32 }
define_impl! { i32x4, boolu32x4, i32, 4, x0, x1, x2, x3 }
define_ty_internal! { u16x8, u16, u16, u16, u16, u16, u16, u16, u16 }
define_impl! { u16x8, boolu16x8, u16, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty_internal! { i16x8, i16, i16, i16, i16, i16, i16, i16, i16 }
define_impl! { i16x8, boolu16x8, i16, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty_internal! {
u8x16, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8
}
define_impl! {
u8x16, boolu8x16, u8, 16,
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
}
define_ty_internal! {
i8x16, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8
}
define_impl! {
i8x16, boolu8x16, i8, 16,
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
}

105
library/stdarch/src/v64.rs Normal file
View file

@ -0,0 +1,105 @@
use std::mem::transmute;
use simd::*;
macro_rules! define_ty {
($name:ident, $($elty:ident),+) => {
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct $name($($elty),*);
}
}
macro_rules! define_ty_internal {
($name:ident, $($elty:ident),+) => {
#[repr(simd)]
#[derive(Clone, Copy, Debug, PartialEq)]
#[allow(non_camel_case_types)]
pub struct $name($($elty),*);
}
}
macro_rules! define_impl {
($name:ident, $boolname:ident, $elemty:ident, $nelems:expr,
$($elname:ident),+) => {
impl From<__m64> for $name {
#[inline]
fn from(v: __m64) -> $name { unsafe { transmute(v) } }
}
impl $name {
#[inline]
pub fn new($($elname: $elemty),*) -> $name {
$name($($elname),*)
}
#[inline]
pub fn splat(value: $elemty) -> $name {
$name($({
#[allow(non_camel_case_types, dead_code)]
struct $elname;
value
}),*)
}
#[inline]
pub fn eq(self, other: $name) -> $boolname {
unsafe { simd_eq(self, other) }
}
#[inline]
pub fn ne(self, other: $name) -> $boolname {
unsafe { simd_ne(self, other) }
}
#[inline]
pub fn lt(self, other: $name) -> $boolname {
unsafe { simd_lt(self, other) }
}
#[inline]
pub fn le(self, other: $name) -> $boolname {
unsafe { simd_le(self, other) }
}
#[inline]
pub fn gt(self, other: $name) -> $boolname {
unsafe { simd_gt(self, other) }
}
#[inline]
pub fn ge(self, other: $name) -> $boolname {
unsafe { simd_ge(self, other) }
}
#[inline]
pub unsafe fn extract(self, idx: u32) -> $elemty {
debug_assert!(idx < $nelems);
simd_extract(self, idx)
}
#[inline]
pub unsafe fn insert(self, idx: u32, val: $elemty) -> $name {
debug_assert!(idx < $nelems);
simd_insert(self, idx, val)
}
#[inline]
pub fn as_m64(self) -> __m64 { unsafe { transmute(self) } }
#[inline]
pub fn as_u64(self) -> u64 { unsafe { transmute(self) } }
}
}
}
define_ty! { __m64, u64 }
define_ty_internal! { boolu64x1, u64 }
define_ty_internal! { boolu32x2, u32, u32 }
define_ty_internal! { u64x1, u64 }
define_impl! { u64x1, boolu64x1, u64, 1, x0 }
define_ty_internal! { u32x2, u32, u32 }
define_impl! { u32x2, boolu32x2, u32, 2, x0, x1 }

View file

@ -1,29 +1,5 @@
pub use self::sse::*;
pub use self::sse2::*;
mod sse;
mod sse2;
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct __m128(f32, f32, f32, f32);
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct __m128d(f64, f64);
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct __m128i(u64, u64);
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct f64x2(f64, f64);
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
struct u8x16(u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8);

View file

@ -1,26 +1,568 @@
use std::mem::transmute;
use std::os::raw::c_void;
use super::{__m128d, __m128i, f64x2, u8x16};
use {simd_add, simd_extract, simd_insert};
use simd::*;
use v128::*;
use v64::*;
/// Provide a hint to the processor that the code sequence is a spin-wait loop.
///
/// This can help improve the performance and power consumption of spin-wait
/// loops.
#[inline]
pub unsafe fn _mm_pause() {
pause()
}
/// Invalidate and flush the cache line that contains `p` from all levels of
/// the cache hierarchy.
#[inline]
pub unsafe fn _mm_clflush(p: *mut c_void) {
clflush(p)
}
/// Perform a serializing operation on all load-from-memory instructions
/// that were issued prior to this instruction.
///
/// Guarantees that every load instruction that precedes, in program order, is
/// globally visible before any load instruction which follows the fence in
/// program order.
#[inline]
pub unsafe fn _mm_lfence() {
lfence()
}
/// Perform a serializing operation on all load-from-memory and store-to-memory
/// instructions that were issued prior to this instruction.
///
/// Guarantees that every memory access that precedes, in program order, the
/// memory fence instruction is globally visible before any memory instruction
/// which follows the fence in program order.
#[inline]
pub unsafe fn _mm_mfence() {
mfence()
}
/// Add packed 8-bit integers in "a" and "b", and return the results.
#[inline]
pub unsafe fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
transmute(simd_add::<u8x16>(transmute(a), transmute(b)))
simd_add(u8x16::from(a), u8x16::from(b)).as_m128i()
}
/// Add packed 16-bit integers in "a" and "b", and return the results.
#[inline]
pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
simd_add(u16x8::from(a), u16x8::from(b)).as_m128i()
}
/// Add packed 32-bit integers in "a" and "b", and return the results.
#[inline]
pub unsafe fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
simd_add(u32x4::from(a), u32x4::from(b)).as_m128i()
}
/// Add 64-bit integers "a" and "b", and return the results.
#[inline]
unsafe fn _mm_add_si64(_a: __m64, _b: __m64) -> __m64 {
unimplemented!()
}
/// Add packed 64-bit integers in "a" and "b", and return the results.
#[inline]
pub unsafe fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
simd_add(u64x2::from(a), u64x2::from(b)).as_m128i()
}
/// Add packed 8-bit integers in "a" and "b" using saturation, and return the
/// results.
#[inline]
pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
paddsb(i8x16::from(a), i8x16::from(b)).as_m128i()
}
/// Add packed 16-bit integers in "a" and "b" using saturation, and return the
/// results.
#[inline]
pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
paddsw(i16x8::from(a), i16x8::from(b)).as_m128i()
}
/// Add packed unsigned 8-bit integers in "a" and "b" using saturation, and
/// return the results.
#[inline]
pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
paddsub(u8x16::from(a), u8x16::from(b)).as_m128i()
}
/// Add packed unsigned 16-bit integers in "a" and "b" using saturation, and
/// return the results.
#[inline]
pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
paddsuw(u16x8::from(a), u16x8::from(b)).as_m128i()
}
/// Average packed unsigned 8-bit integers in "a" and "b", and return the
/// results.
#[inline]
pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
pavgb(u8x16::from(a), u8x16::from(b)).as_m128i()
}
/// Average packed unsigned 16-bit integers in "a" and "b", and return the
/// results.
#[inline]
pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
pavgw(u16x8::from(a), u16x8::from(b)).as_m128i()
}
/// Multiply packed signed 16-bit integers in "a" and "b", producing
/// intermediate signed 32-bit integers.
///
/// Horizontally add adjacent pairs of intermediate 32-bit integers, and pack
/// the results in "dst".
#[inline]
pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
pmaddwd(i16x8::from(a), i16x8::from(b)).as_m128i()
}
/// Compare packed 16-bit integers in `a` and `b`, and return the packed
/// maximum values.
#[inline]
pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
pmaxsw(i16x8::from(a), i16x8::from(b)).as_m128i()
}
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return the
/// packed maximum values.
#[inline]
pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
pmaxub(u8x16::from(a), u8x16::from(b)).as_m128i()
}
/// Compare packed 16-bit integers in `a` and `b`, and return the packed
/// minimum values.
#[inline]
pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
pminsw(i16x8::from(a), i16x8::from(b)).as_m128i()
}
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return the
/// packed minimum values.
#[inline]
pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
pminub(u8x16::from(a), u8x16::from(b)).as_m128i()
}
/// Multiply the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
#[inline]
pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
pmulhw(i16x8::from(a), i16x8::from(b)).as_m128i()
}
/// Multiply the packed unsigned 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
#[inline]
pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
pmulhuw(u16x8::from(a), u16x8::from(b)).as_m128i()
}
/// Multiply the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// low 16 bits of the intermediate integers.
#[inline]
pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
simd_mul(i16x8::from(a), i16x8::from(b)).as_m128i()
}
/// Multiply the low unsigned 32-bit integers from `a` and `b`.
///
/// Return the unsigned 64-bit result.
#[inline]
unsafe fn _mm_mul_su32(_a: __m64, _b: __m64) -> __m64 {
unimplemented!()
}
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element
/// in `a` and `b`.
///
/// Return the unsigned 64-bit results.
#[inline]
pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
pmuludq(u32x4::from(a), u32x4::from(b)).as_m128i()
}
/// Sum the absolute differences of packed unsigned 8-bit integers.
///
/// Compute the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sum each consecutive 8 differences to produce
/// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in
/// the low 16 bits of 64-bit elements returned.
#[inline]
pub unsafe fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
psadbw(u8x16::from(a), u8x16::from(b)).as_m128i()
}
#[inline]
pub unsafe fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
let alow = simd_extract::<f64x2, f64>(transmute(a), 0);
let blow = simd_extract::<f64x2, f64>(transmute(b), 0);
transmute(simd_insert::<f64x2, f64>(transmute(a), 0, alow + blow))
let (a, b) = (f64x2::from(a), f64x2::from(b));
a.insert(0, a.extract(0) + b.extract(0)).as_m128d()
}
#[inline]
pub unsafe fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
transmute(simd_add::<f64x2>(transmute(a), transmute(b)))
simd_add(f64x2::from(a), f64x2::from(b)).as_m128d()
}
#[inline]
pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
*(mem_addr as *const __m128d)
}
#[inline]
pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
*(mem_addr as *mut __m128d) = a;
}
#[allow(improper_ctypes)]
extern {
#[link_name = "llvm.x86.sse2.pause"]
pub fn pause();
#[link_name = "llvm.x86.sse2.clflush"]
pub fn clflush(p: *mut c_void);
#[link_name = "llvm.x86.sse2.lfence"]
pub fn lfence();
#[link_name = "llvm.x86.sse2.mfence"]
pub fn mfence();
#[link_name = "llvm.x86.sse2.padds.b"]
pub fn paddsb(a: i8x16, b: i8x16) -> i8x16;
#[link_name = "llvm.x86.sse2.padds.w"]
pub fn paddsw(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.sse2.paddus.b"]
pub fn paddsub(a: u8x16, b: u8x16) -> u8x16;
#[link_name = "llvm.x86.sse2.paddus.w"]
pub fn paddsuw(a: u16x8, b: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse2.pavg.b"]
pub fn pavgb(a: u8x16, b: u8x16) -> u8x16;
#[link_name = "llvm.x86.sse2.pavg.w"]
pub fn pavgw(a: u16x8, b: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse2.pmadd.wd"]
pub fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
#[link_name = "llvm.x86.sse2.pmaxs.w"]
pub fn pmaxsw(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.sse2.pmaxu.b"]
pub fn pmaxub(a: u8x16, b: u8x16) -> u8x16;
#[link_name = "llvm.x86.sse2.pmins.w"]
pub fn pminsw(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.sse2.pminu.b"]
pub fn pminub(a: u8x16, b: u8x16) -> u8x16;
#[link_name = "llvm.x86.sse2.pmulh.w"]
pub fn pmulhw(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.sse2.pmulhu.w"]
pub fn pmulhuw(a: u16x8, b: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse2.pmulu.dq"]
pub fn pmuludq(a: u32x4, b: u32x4) -> u64x2;
#[link_name = "llvm.x86.sse2.psad.bw"]
pub fn psadbw(a: u8x16, b: u8x16) -> u64x2;
}
#[cfg(test)]
mod tests {
use std::os::raw::c_void;
use v128::*;
use v64::*;
use x86::sse2 as sse2;
#[test]
fn _mm_pause() {
unsafe { sse2::_mm_pause() }
}
#[test]
fn _mm_clflush() {
let x = 0;
unsafe { sse2::_mm_clflush(&x as *const _ as *mut c_void) }
}
#[test]
fn _mm_lfence() {
unsafe { sse2::_mm_lfence() }
}
#[test]
fn _mm_mfence() {
unsafe { sse2::_mm_mfence() }
}
#[test]
fn _mm_add_epi8() {
let a = u8x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = u8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
let r = unsafe { sse2::_mm_add_epi8(a.as_m128i(), b.as_m128i()) };
let e = u8x16::new(
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
assert_eq!(u8x16::from(r), e);
}
#[test]
fn _mm_adds_epi8_overflow() {
let a = u8x16::splat(0xFF);
let b = u8x16::splat(1);
let r = unsafe { sse2::_mm_adds_epi8(a.as_m128i(), b.as_m128i()) };
assert_eq!(u8x16::from(r), u8x16::splat(0));
}
#[test]
fn _mm_add_epi16() {
let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b = u16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
let r = unsafe { sse2::_mm_add_epi16(a.as_m128i(), b.as_m128i()) };
let e = u16x8::new(8, 10, 12, 14, 16, 18, 20, 22);
assert_eq!(u16x8::from(r), e);
}
#[test]
fn _mm_add_epi32() {
let a = u32x4::new(0, 1, 2, 3);
let b = u32x4::new(4, 5, 6, 7);
let r = unsafe { sse2::_mm_add_epi32(a.as_m128i(), b.as_m128i()) };
let e = u32x4::new(4, 6, 8, 10);
assert_eq!(u32x4::from(r), e);
}
#[test]
#[ignore]
fn _mm_add_si64() {
let (a, b) = (u64x1::new(1), u64x1::new(2));
let r = unsafe { sse2::_mm_add_si64(a.as_m64(), b.as_m64()) };
let e = u64x1::new(3);
assert_eq!(u64x1::from(r), e);
}
#[test]
fn _mm_add_epi64() {
let a = u64x2::new(0, 1);
let b = u64x2::new(2, 3);
let r = unsafe { sse2::_mm_add_epi64(a.as_m128i(), b.as_m128i()) };
let e = u64x2::new(2, 4);
assert_eq!(u64x2::from(r), e);
}
#[test]
fn _mm_adds_epi8() {
let a = i8x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = i8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
let r = unsafe { sse2::_mm_adds_epi8(a.as_m128i(), b.as_m128i()) };
let e = i8x16::new(
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
assert_eq!(i8x16::from(r), e);
}
#[test]
fn _mm_adds_epi8_saturate_positive() {
let a = i8x16::splat(0x7F);
let b = i8x16::splat(1);
let r = unsafe { sse2::_mm_adds_epi8(a.as_m128i(), b.as_m128i()) };
assert_eq!(i8x16::from(r), a);
}
#[test]
fn _mm_adds_epi8_saturate_negative() {
let a = i8x16::splat(-0x80);
let b = i8x16::splat(-1);
let r = unsafe { sse2::_mm_adds_epi8(a.as_m128i(), b.as_m128i()) };
assert_eq!(i8x16::from(r), a);
}
#[test]
fn _mm_adds_epi16() {
let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
let r = unsafe { sse2::_mm_adds_epi16(a.as_m128i(), b.as_m128i()) };
let e = i16x8::new(8, 10, 12, 14, 16, 18, 20, 22);
assert_eq!(i16x8::from(r), e);
}
#[test]
fn _mm_adds_epi16_saturate_positive() {
let a = i16x8::splat(0x7FFF);
let b = i16x8::splat(1);
let r = unsafe { sse2::_mm_adds_epi16(a.as_m128i(), b.as_m128i()) };
assert_eq!(i16x8::from(r), a);
}
#[test]
fn _mm_adds_epi16_saturate_negative() {
let a = i16x8::splat(-0x8000);
let b = i16x8::splat(-1);
let r = unsafe { sse2::_mm_adds_epi16(a.as_m128i(), b.as_m128i()) };
assert_eq!(i16x8::from(r), a);
}
#[test]
fn _mm_adds_epu8() {
let a = u8x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = u8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
let r = unsafe { sse2::_mm_adds_epu8(a.as_m128i(), b.as_m128i()) };
let e = u8x16::new(
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
assert_eq!(u8x16::from(r), e);
}
#[test]
fn _mm_adds_epu8_saturate() {
let a = u8x16::splat(0xFF);
let b = u8x16::splat(1);
let r = unsafe { sse2::_mm_adds_epu8(a.as_m128i(), b.as_m128i()) };
assert_eq!(u8x16::from(r), a);
}
#[test]
fn _mm_adds_epu16() {
let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b = u16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
let r = unsafe { sse2::_mm_adds_epu16(a.as_m128i(), b.as_m128i()) };
let e = u16x8::new(8, 10, 12, 14, 16, 18, 20, 22);
assert_eq!(u16x8::from(r), e);
}
#[test]
fn _mm_adds_epu16_saturate() {
let a = u16x8::splat(0xFFFF);
let b = u16x8::splat(1);
let r = unsafe { sse2::_mm_adds_epu16(a.as_m128i(), b.as_m128i()) };
assert_eq!(u16x8::from(r), a);
}
#[test]
fn _mm_avg_epu8() {
let (a, b) = (u8x16::splat(3), u8x16::splat(9));
let r = unsafe { sse2::_mm_avg_epu8(a.as_m128i(), b.as_m128i()) };
assert_eq!(u8x16::from(r), u8x16::splat(6));
}
#[test]
fn _mm_avg_epu16() {
    // Was a copy-paste bug: it called `sse2::_mm_avg_epu8` (the 8-bit
    // variant) and only passed by accident of byte layout — each u16 lane
    // of splat(3)/splat(9) is the byte pair (3, 0)/(9, 0), whose byte-wise
    // rounded average is (6, 0), i.e. u16 value 6. Call the 16-bit
    // intrinsic under test so PAVGW is actually exercised.
    let (a, b) = (u16x8::splat(3), u16x8::splat(9));
    let r = unsafe { sse2::_mm_avg_epu16(a.as_m128i(), b.as_m128i()) };
    assert_eq!(u16x8::from(r), u16x8::splat(6));
}
#[test]
fn _mm_madd_epi16() {
let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let b = i16x8::new(9, 10, 11, 12, 13, 14, 15, 16);
let r = unsafe { sse2::_mm_madd_epi16(a.as_m128i(), b.as_m128i()) };
let e = i32x4::new(29, 81, 149, 233);
assert_eq!(i32x4::from(r), e);
}
#[test]
fn _mm_max_epi16() {
let a = i16x8::splat(1);
let b = i16x8::splat(-1);
let r = unsafe { sse2::_mm_max_epi16(a.as_m128i(), b.as_m128i()) };
assert_eq!(i16x8::from(r), a);
}
#[test]
fn _mm_max_epu8() {
let a = u8x16::splat(1);
let b = u8x16::splat(255);
let r = unsafe { sse2::_mm_max_epu8(a.as_m128i(), b.as_m128i()) };
assert_eq!(u8x16::from(r), b);
}
#[test]
fn _mm_min_epi16() {
let a = i16x8::splat(1);
let b = i16x8::splat(-1);
let r = unsafe { sse2::_mm_min_epi16(a.as_m128i(), b.as_m128i()) };
assert_eq!(i16x8::from(r), b);
}
#[test]
fn _mm_min_epu8() {
let a = u8x16::splat(1);
let b = u8x16::splat(255);
let r = unsafe { sse2::_mm_min_epu8(a.as_m128i(), b.as_m128i()) };
assert_eq!(u8x16::from(r), a);
}
#[test]
fn _mm_mulhi_epi16() {
let (a, b) = (i16x8::splat(1000), i16x8::splat(-1001));
let r = unsafe { sse2::_mm_mulhi_epi16(a.as_m128i(), b.as_m128i()) };
assert_eq!(i16x8::from(r), i16x8::splat(-16));
}
#[test]
fn _mm_mulhi_epu16() {
let (a, b) = (u16x8::splat(1000), u16x8::splat(1001));
let r = unsafe { sse2::_mm_mulhi_epu16(a.as_m128i(), b.as_m128i()) };
assert_eq!(u16x8::from(r), u16x8::splat(15));
}
#[test]
fn _mm_mullo_epi16() {
let (a, b) = (i16x8::splat(1000), i16x8::splat(-1001));
let r = unsafe { sse2::_mm_mullo_epi16(a.as_m128i(), b.as_m128i()) };
assert_eq!(i16x8::from(r), i16x8::splat(-17960));
}
#[test]
#[ignore]
fn _mm_mul_su32() {
let a = u32x2::new(1_000_000_000, 3);
let b = u32x2::new(1_000_000_000, 4);
let r = unsafe { sse2::_mm_mul_su32(a.as_m64(), b.as_m64()) };
let e = u64x1::new(1_000_000_000 * 1_000_000_000);
assert_eq!(u64x1::from(r), e);
}
#[test]
fn _mm_mul_epu32() {
let a = u64x2::new(1_000_000_000, 1 << 34);
let b = u64x2::new(1_000_000_000, 1 << 35);
let r = unsafe { sse2::_mm_mul_epu32(a.as_m128i(), b.as_m128i()) };
let e = u64x2::new(1_000_000_000 * 1_000_000_000, 0);
assert_eq!(u64x2::from(r), e);
}
#[test]
fn _mm_sad_epu8() {
let a = u8x16::new(
255, 254, 253, 252, 1, 2, 3, 4,
155, 154, 153, 152, 1, 2, 3, 4);
let b = u8x16::new(
0, 0, 0, 0, 2, 1, 2, 1,
1, 1, 1, 1, 1, 2, 1, 2);
let r = unsafe { sse2::_mm_sad_epu8(a.as_m128i(), b.as_m128i()) };
let e = u64x2::new(1020, 614);
assert_eq!(u64x2::from(r), e);
}
}