Add 256-bit vectors and some SSE intrinsics.

This commit is contained in:
Andrew Gallant 2017-03-18 15:21:32 -04:00
parent df03dc4d80
commit 27e307278a
9 changed files with 687 additions and 69 deletions

View file

@ -1,3 +1,167 @@
**TIP**: Use the following command to generate a section in this list for
Intel intrinsics. Replace `SSE4.2` with the name of the intended CPU
feature/extension (e.g. `AVX2`).
```
rg '^<intrinsic' intel-intrinsics-3.3.15.xml | rg "'SSE4.2'" | rg '^.*name=\x27([^\x27]+)\x27.*$' -r '* [ ] `$1`' >> TODO.md
```
sse
---
* [ ] `_MM_TRANSPOSE4_PS`
* [ ] `_mm_getcsr`
* [ ] `_mm_setcsr`
* [ ] `_MM_GET_EXCEPTION_STATE`
* [ ] `_MM_SET_EXCEPTION_STATE`
* [ ] `_MM_GET_EXCEPTION_MASK`
* [ ] `_MM_SET_EXCEPTION_MASK`
* [ ] `_MM_GET_ROUNDING_MODE`
* [ ] `_MM_SET_ROUNDING_MODE`
* [ ] `_MM_GET_FLUSH_ZERO_MODE`
* [ ] `_MM_SET_FLUSH_ZERO_MODE`
* [ ] `_mm_prefetch`
* [ ] `_mm_sfence`
* [ ] `_mm_max_pi16`
* [ ] `_m_pmaxsw`
* [ ] `_mm_max_pu8`
* [ ] `_m_pmaxub`
* [ ] `_mm_min_pi16`
* [ ] `_m_pminsw`
* [ ] `_mm_min_pu8`
* [ ] `_m_pminub`
* [ ] `_mm_mulhi_pu16`
* [ ] `_m_pmulhuw`
* [ ] `_mm_avg_pu8`
* [ ] `_m_pavgb`
* [ ] `_mm_avg_pu16`
* [ ] `_m_pavgw`
* [ ] `_mm_sad_pu8`
* [ ] `_m_psadbw`
* [ ] `_mm_cvtsi32_ss`
* [ ] `_mm_cvt_si2ss`
* [ ] `_mm_cvtsi64_ss`
* [ ] `_mm_cvtpi32_ps`
* [ ] `_mm_cvt_pi2ps`
* [ ] `_mm_cvtpi16_ps`
* [ ] `_mm_cvtpu16_ps`
* [ ] `_mm_cvtpi8_ps`
* [ ] `_mm_cvtpu8_ps`
* [ ] `_mm_cvtpi32x2_ps`
* [ ] `_mm_stream_pi`
* [ ] `_mm_maskmove_si64`
* [ ] `_m_maskmovq`
* [ ] `_mm_extract_pi16`
* [ ] `_m_pextrw`
* [ ] `_mm_insert_pi16`
* [ ] `_m_pinsrw`
* [ ] `_mm_movemask_pi8`
* [ ] `_m_pmovmskb`
* [ ] `_mm_shuffle_pi16`
* [ ] `_m_pshufw`
* [ ] `_mm_add_ss`
* [ ] `_mm_add_ps`
* [ ] `_mm_sub_ss`
* [ ] `_mm_sub_ps`
* [ ] `_mm_mul_ss`
* [ ] `_mm_mul_ps`
* [ ] `_mm_div_ss`
* [ ] `_mm_div_ps`
* [ ] `_mm_sqrt_ss`
* [x] `_mm_sqrt_ps`
* [ ] `_mm_rcp_ss`
* [x] `_mm_rcp_ps`
* [ ] `_mm_rsqrt_ss`
* [x] `_mm_rsqrt_ps`
* [ ] `_mm_min_ss`
* [x] `_mm_min_ps`
* [ ] `_mm_max_ss`
* [x] `_mm_max_ps`
* [ ] `_mm_and_ps`
* [ ] `_mm_andnot_ps`
* [ ] `_mm_or_ps`
* [ ] `_mm_xor_ps`
* [ ] `_mm_cmpeq_ss`
* [ ] `_mm_cmpeq_ps`
* [ ] `_mm_cmplt_ss`
* [ ] `_mm_cmplt_ps`
* [ ] `_mm_cmple_ss`
* [ ] `_mm_cmple_ps`
* [ ] `_mm_cmpgt_ss`
* [ ] `_mm_cmpgt_ps`
* [ ] `_mm_cmpge_ss`
* [ ] `_mm_cmpge_ps`
* [ ] `_mm_cmpneq_ss`
* [ ] `_mm_cmpneq_ps`
* [ ] `_mm_cmpnlt_ss`
* [ ] `_mm_cmpnlt_ps`
* [ ] `_mm_cmpnle_ss`
* [ ] `_mm_cmpnle_ps`
* [ ] `_mm_cmpngt_ss`
* [ ] `_mm_cmpngt_ps`
* [ ] `_mm_cmpnge_ss`
* [ ] `_mm_cmpnge_ps`
* [ ] `_mm_cmpord_ss`
* [ ] `_mm_cmpord_ps`
* [ ] `_mm_cmpunord_ss`
* [ ] `_mm_cmpunord_ps`
* [ ] `_mm_comieq_ss`
* [ ] `_mm_comilt_ss`
* [ ] `_mm_comile_ss`
* [ ] `_mm_comigt_ss`
* [ ] `_mm_comige_ss`
* [ ] `_mm_comineq_ss`
* [ ] `_mm_ucomieq_ss`
* [ ] `_mm_ucomilt_ss`
* [ ] `_mm_ucomile_ss`
* [ ] `_mm_ucomigt_ss`
* [ ] `_mm_ucomige_ss`
* [ ] `_mm_ucomineq_ss`
* [ ] `_mm_cvtss_si32`
* [ ] `_mm_cvt_ss2si`
* [ ] `_mm_cvtss_si64`
* [ ] `_mm_cvtss_f32`
* [ ] `_mm_cvtps_pi32`
* [ ] `_mm_cvt_ps2pi`
* [ ] `_mm_cvttss_si32`
* [ ] `_mm_cvtt_ss2si`
* [ ] `_mm_cvttss_si64`
* [ ] `_mm_cvttps_pi32`
* [ ] `_mm_cvtt_ps2pi`
* [ ] `_mm_cvtps_pi16`
* [ ] `_mm_cvtps_pi8`
* [ ] `_mm_set_ss`
* [ ] `_mm_set1_ps`
* [ ] `_mm_set_ps1`
* [ ] `_mm_set_ps`
* [ ] `_mm_setr_ps`
* [ ] `_mm_setzero_ps`
* [ ] `_mm_loadh_pi`
* [ ] `_mm_loadl_pi`
* [ ] `_mm_load_ss`
* [ ] `_mm_load1_ps`
* [ ] `_mm_load_ps1`
* [ ] `_mm_load_ps`
* [ ] `_mm_loadu_ps`
* [ ] `_mm_loadr_ps`
* [ ] `_mm_stream_ps`
* [ ] `_mm_storeh_pi`
* [ ] `_mm_storel_pi`
* [ ] `_mm_store_ss`
* [ ] `_mm_store1_ps`
* [ ] `_mm_store_ps1`
* [ ] `_mm_store_ps`
* [ ] `_mm_storeu_ps`
* [ ] `_mm_storer_ps`
* [ ] `_mm_move_ss`
* [ ] `_mm_shuffle_ps`
* [ ] `_mm_unpackhi_ps`
* [ ] `_mm_unpacklo_ps`
* [ ] `_mm_movehl_ps`
* [ ] `_mm_movelh_ps`
* [x] `_mm_movemask_ps`
* [ ] `_mm_undefined_ps`
sse2
----
* [x] `_mm_pause`
@ -221,7 +385,7 @@ sse2
* [ ] `_mm_storel_pd`
* [ ] `_mm_unpackhi_pd`
* [ ] `_mm_unpacklo_pd`
* [ ] `_mm_movemask_pd`
* [x] `_mm_movemask_pd`
* [ ] `_mm_shuffle_pd`
* [ ] `_mm_move_sd`
* [ ] `_mm_castpd_ps`
@ -234,6 +398,21 @@ sse2
* [ ] `_mm_undefined_si128`
sse3
----
* [ ] `_mm_addsub_ps`
* [ ] `_mm_addsub_pd`
* [ ] `_mm_hadd_pd`
* [ ] `_mm_hadd_ps`
* [ ] `_mm_hsub_pd`
* [ ] `_mm_hsub_ps`
* [ ] `_mm_lddqu_si128`
* [ ] `_mm_movedup_pd`
* [ ] `_mm_loaddup_pd`
* [ ] `_mm_movehdup_ps`
* [ ] `_mm_moveldup_ps`
ssse3
-----
* [ ] `_mm_abs_pi8`
@ -268,3 +447,91 @@ ssse3
* [ ] `_mm_sign_pi8`
* [ ] `_mm_sign_pi16`
* [ ] `_mm_sign_pi32`
sse4.1
------
* [ ] `_mm_blend_pd`
* [ ] `_mm_blend_ps`
* [ ] `_mm_blendv_pd`
* [ ] `_mm_blendv_ps`
* [ ] `_mm_blendv_epi8`
* [ ] `_mm_blend_epi16`
* [ ] `_mm_dp_pd`
* [ ] `_mm_dp_ps`
* [ ] `_mm_extract_ps`
* [ ] `_mm_extract_epi8`
* [ ] `_mm_extract_epi32`
* [ ] `_mm_extract_epi64`
* [ ] `_mm_insert_ps`
* [ ] `_mm_insert_epi8`
* [ ] `_mm_insert_epi32`
* [ ] `_mm_insert_epi64`
* [ ] `_mm_max_epi8`
* [ ] `_mm_max_epi32`
* [ ] `_mm_max_epu32`
* [ ] `_mm_max_epu16`
* [ ] `_mm_min_epi8`
* [ ] `_mm_min_epi32`
* [ ] `_mm_min_epu32`
* [ ] `_mm_min_epu16`
* [ ] `_mm_packus_epi32`
* [ ] `_mm_cmpeq_epi64`
* [ ] `_mm_cvtepi8_epi16`
* [ ] `_mm_cvtepi8_epi32`
* [ ] `_mm_cvtepi8_epi64`
* [ ] `_mm_cvtepi16_epi32`
* [ ] `_mm_cvtepi16_epi64`
* [ ] `_mm_cvtepi32_epi64`
* [ ] `_mm_cvtepu8_epi16`
* [ ] `_mm_cvtepu8_epi32`
* [ ] `_mm_cvtepu8_epi64`
* [ ] `_mm_cvtepu16_epi32`
* [ ] `_mm_cvtepu16_epi64`
* [ ] `_mm_cvtepu32_epi64`
* [ ] `_mm_mul_epi32`
* [ ] `_mm_mullo_epi32`
* [ ] `_mm_testz_si128`
* [ ] `_mm_testc_si128`
* [ ] `_mm_testnzc_si128`
* [ ] `_mm_test_all_zeros`
* [ ] `_mm_test_mix_ones_zeros`
* [ ] `_mm_test_all_ones`
* [ ] `_mm_round_pd`
* [ ] `_mm_floor_pd`
* [ ] `_mm_ceil_pd`
* [ ] `_mm_round_ps`
* [ ] `_mm_floor_ps`
* [ ] `_mm_ceil_ps`
* [ ] `_mm_round_sd`
* [ ] `_mm_floor_sd`
* [ ] `_mm_ceil_sd`
* [ ] `_mm_round_ss`
* [ ] `_mm_floor_ss`
* [ ] `_mm_ceil_ss`
* [ ] `_mm_minpos_epu16`
* [ ] `_mm_mpsadbw_epu8`
* [ ] `_mm_stream_load_si128`
sse4.2
------
* [ ] `_mm_cmpistrm`
* [ ] `_mm_cmpistri`
* [ ] `_mm_cmpistrz`
* [ ] `_mm_cmpistrc`
* [ ] `_mm_cmpistrs`
* [ ] `_mm_cmpistro`
* [ ] `_mm_cmpistra`
* [ ] `_mm_cmpestrm`
* [ ] `_mm_cmpestri`
* [ ] `_mm_cmpestrz`
* [ ] `_mm_cmpestrc`
* [ ] `_mm_cmpestrs`
* [ ] `_mm_cmpestro`
* [ ] `_mm_cmpestra`
* [ ] `_mm_cmpgt_epi64`
* [ ] `_mm_crc32_u8`
* [ ] `_mm_crc32_u16`
* [ ] `_mm_crc32_u32`
* [ ] `_mm_crc32_u64`

View file

@ -5,6 +5,7 @@
)]
pub use v128::*;
pub use v256::*;
pub use v64::*;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub use x86::*;
@ -13,6 +14,7 @@ pub use x86::*;
mod macros;
mod simd;
mod v128;
mod v256;
mod v64;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod x86;

View file

@ -8,8 +8,10 @@ macro_rules! define_ty {
}
macro_rules! define_impl {
($name:ident, $elemty:ident, $nelems:expr,
$($elname:ident),+) => {
(
$name:ident, $elemty:ident, $nelems:expr, $boolname:ident,
$($elname:ident),+
) => {
impl $name {
#[inline]
pub fn new($($elname: $elemty),*) -> $name {
@ -25,25 +27,46 @@ macro_rules! define_impl {
}),*)
}
#[inline(always)]
#[inline]
pub fn extract(self, idx: u32) -> $elemty {
assert!(idx < $nelems);
unsafe { simd_extract(self, idx) }
}
#[inline(always)]
pub fn insert(self, idx: u32, val: $elemty) -> $name {
#[inline]
pub fn replace(self, idx: u32, val: $elemty) -> $name {
assert!(idx < $nelems);
unsafe { simd_insert(self, idx, val) }
}
#[inline(always)]
#[inline]
pub fn store(self, slice: &mut [$elemty], offset: usize) {
assert!(slice[offset..].len() >= $nelems);
unsafe { self.store_unchecked(slice, offset) }
}
#[inline]
pub unsafe fn store_unchecked(
self,
slice: &mut [$elemty],
offset: usize,
) {
use std::mem::size_of;
use std::ptr;
ptr::copy_nonoverlapping(
&self as *const $name as *const u8,
slice.get_unchecked_mut(offset) as *mut $elemty as *mut u8,
size_of::<$name>());
}
#[inline]
pub fn load(slice: &[$elemty], offset: usize) -> $name {
assert!(slice[offset..].len() >= $nelems);
unsafe { $name::load_unchecked(slice, offset) }
}
#[inline(always)]
#[inline]
pub unsafe fn load_unchecked(
slice: &[$elemty],
offset: usize,
@ -58,6 +81,36 @@ macro_rules! define_impl {
size_of::<$name>());
x
}
#[inline]
pub fn eq(self, other: $name) -> $boolname {
unsafe { simd_eq(self, other) }
}
#[inline]
pub fn ne(self, other: $name) -> $boolname {
unsafe { simd_ne(self, other) }
}
#[inline]
pub fn lt(self, other: $name) -> $boolname {
unsafe { simd_lt(self, other) }
}
#[inline]
pub fn le(self, other: $name) -> $boolname {
unsafe { simd_le(self, other) }
}
#[inline]
pub fn gt(self, other: $name) -> $boolname {
unsafe { simd_gt(self, other) }
}
#[inline]
pub fn ge(self, other: $name) -> $boolname {
unsafe { simd_ge(self, other) }
}
}
}
}
@ -177,3 +230,15 @@ macro_rules! define_integer_ops {
)+
}
}
// Define lane-wise numeric cast methods between vector types.
//
// For each `(source, target, method)` triple this emits
// `impl source { pub fn method(self) -> ::target }`, implemented via the
// `simd_cast` platform intrinsic (a per-lane numeric conversion).
macro_rules! define_casts {
    ($(($ty:ident, $floatty:ident, $floatcast:ident)),+) => {
        $(
            impl $ty {
                /// Lane-wise cast of `self` to the target vector type.
                // `#[inline]` for consistency with the other methods
                // generated in this file (see `define_impl!`).
                #[inline]
                pub fn $floatcast(self) -> ::$floatty {
                    unsafe { simd_cast(self) }
                }
            }
        )+
    }
}

View file

@ -1,34 +1,34 @@
use simd::*;
define_ty! { f64x2, f64, f64 }
define_impl! { f64x2, f64, 2, x0, x1 }
define_impl! { f64x2, f64, 2, i64x2, x0, x1 }
define_ty! { f32x4, f32, f32, f32, f32 }
define_impl! { f32x4, f32, 4, x0, x1, x2, x3 }
define_impl! { f32x4, f32, 4, i32x4, x0, x1, x2, x3 }
define_ty! { u64x2, u64, u64 }
define_impl! { u64x2, u64, 2, x0, x1 }
define_impl! { u64x2, u64, 2, i64x2, x0, x1 }
define_ty! { i64x2, i64, i64 }
define_impl! { i64x2, i64, 2, x0, x1 }
define_impl! { i64x2, i64, 2, i64x2, x0, x1 }
define_ty! { u32x4, u32, u32, u32, u32 }
define_impl! { u32x4, u32, 4, x0, x1, x2, x3 }
define_impl! { u32x4, u32, 4, i32x4, x0, x1, x2, x3 }
define_ty! { i32x4, i32, i32, i32, i32 }
define_impl! { i32x4, i32, 4, x0, x1, x2, x3 }
define_impl! { i32x4, i32, 4, i32x4, x0, x1, x2, x3 }
define_ty! { u16x8, u16, u16, u16, u16, u16, u16, u16, u16 }
define_impl! { u16x8, u16, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_impl! { u16x8, u16, 8, i16x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! { i16x8, i16, i16, i16, i16, i16, i16, i16, i16 }
define_impl! { i16x8, i16, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_impl! { i16x8, i16, 8, i16x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! {
u8x16, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8
}
define_impl! {
u8x16, u8, 16,
u8x16, u8, 16, i8x16,
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
}
@ -36,7 +36,7 @@ define_ty! {
i8x16, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8
}
define_impl! {
i8x16, i8, 16,
i8x16, i8, 16, i8x16,
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
}
@ -61,3 +61,22 @@ define_integer_ops!(
(i16x8, i16),
(u8x16, u8),
(i8x16, i8));
define_casts!(
(f64x2, f32x2, as_f32x2),
(f64x2, u64x2, as_u64x2),
(f64x2, i64x2, as_i64x2),
(f32x4, f64x4, as_f64x4),
(f32x4, u32x4, as_u32x4),
(f32x4, i32x4, as_i32x4),
(u64x2, f64x2, as_f64x2),
(u64x2, i64x2, as_i64x2),
(i64x2, f64x2, as_f64x2),
(i64x2, u64x2, as_u64x2),
(u32x4, f32x4, as_f32x4),
(u32x4, i32x4, as_i32x4),
(i32x4, f32x4, as_f32x4),
(i32x4, u32x4, as_u32x4),
(u16x8, i16x8, as_i16x8),
(i16x8, u16x8, as_u16x8),
(u8x16, i8x16, as_i8x16),
(i8x16, u8x16, as_u8x16));

105
library/stdarch/src/v256.rs Normal file
View file

@ -0,0 +1,105 @@
use simd::*;
// 256-bit wide vector types, mirroring the 128-bit types in `v128`.
//
// `define_ty!` declares each vector type from its lane type repeated once
// per lane; `define_impl!` adds the shared API, with its fourth argument
// naming the vector type returned by the comparison methods
// (`eq`/`ne`/`lt`/`le`/`gt`/`ge` in `macros.rs`).
define_ty! { f64x4, f64, f64, f64, f64 }
define_impl! { f64x4, f64, 4, i64x4, x0, x1, x2, x3 }
define_ty! { f32x8, f32, f32, f32, f32, f32, f32, f32, f32 }
define_impl! { f32x8, f32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! { u64x4, u64, u64, u64, u64 }
define_impl! { u64x4, u64, 4, i64x4, x0, x1, x2, x3 }
define_ty! { i64x4, i64, i64, i64, i64 }
define_impl! { i64x4, i64, 4, i64x4, x0, x1, x2, x3 }
define_ty! { u32x8, u32, u32, u32, u32, u32, u32, u32, u32 }
define_impl! { u32x8, u32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! { i32x8, i32, i32, i32, i32, i32, i32, i32, i32 }
define_impl! { i32x8, i32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! {
    u16x16,
    u16, u16, u16, u16, u16, u16, u16, u16,
    u16, u16, u16, u16, u16, u16, u16, u16
}
define_impl! {
    u16x16, u16, 16, i16x16,
    x0, x1, x2, x3, x4, x5, x6, x7,
    x8, x9, x10, x11, x12, x13, x14, x15
}
define_ty! {
    i16x16,
    i16, i16, i16, i16, i16, i16, i16, i16,
    i16, i16, i16, i16, i16, i16, i16, i16
}
define_impl! {
    i16x16, i16, 16, i16x16,
    x0, x1, x2, x3, x4, x5, x6, x7,
    x8, x9, x10, x11, x12, x13, x14, x15
}
define_ty! {
    u8x32,
    u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8,
    u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8
}
define_impl! {
    u8x32, u8, 32, i8x32,
    x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,
    x16, x17, x18, x19, x20, x21, x22, x23,
    x24, x25, x26, x27, x28, x29, x30, x31
}
define_ty! {
    i8x32,
    i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8,
    i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8
}
define_impl! {
    i8x32, i8, 32, i8x32,
    x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,
    x16, x17, x18, x19, x20, x21, x22, x23,
    x24, x25, x26, x27, x28, x29, x30, x31
}

// Conversions between the same-width integer vector types (presumably
// `From` impls — see `define_from!` in macros.rs).
define_from!(u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32);
define_from!(i64x4, u64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32);
define_from!(u32x8, u64x4, i64x4, i32x8, u16x16, i16x16, u8x32, i8x32);
define_from!(i32x8, u64x4, i64x4, u32x8, u16x16, i16x16, u8x32, i8x32);
define_from!(u16x16, u64x4, i64x4, u32x8, i32x8, i16x16, u8x32, i8x32);
define_from!(i16x16, u64x4, i64x4, u32x8, i32x8, u16x16, u8x32, i8x32);
define_from!(u8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, i8x32);
define_from!(i8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32);

// Operator implementations shared by all of the above (see macros.rs).
define_common_ops!(
    f64x4, f32x8, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32);
define_float_ops!(f64x4, f32x8);
define_integer_ops!(
    (u64x4, u64),
    (i64x4, i64),
    (u32x8, u32),
    (i32x8, i32),
    (u16x16, u16),
    (i16x16, i16),
    (u8x32, u8),
    (i8x32, i8));

// Lane-wise `as_*` numeric casts. NOTE: `f64x4 -> f32x4` narrows to a
// 128-bit result type (4 lanes of f32).
define_casts!(
    (f64x4, f32x4, as_f32x4),
    (f64x4, u64x4, as_u64x4),
    (f64x4, i64x4, as_i64x4),
    (f32x8, u32x8, as_u32x8),
    (f32x8, i32x8, as_i32x8),
    (u64x4, f64x4, as_f64x4),
    (u64x4, i64x4, as_i64x4),
    (i64x4, f64x4, as_f64x4),
    (i64x4, u64x4, as_u64x4),
    (u32x8, f32x8, as_f32x8),
    (u32x8, i32x8, as_i32x8),
    (i32x8, f32x8, as_f32x8),
    (i32x8, u32x8, as_u32x8),
    (u16x16, i16x16, as_i16x16),
    (i16x16, u16x16, as_u16x16),
    (u8x32, i8x32, as_i8x32),
    (i8x32, u8x32, as_u8x32));

View file

@ -1,25 +1,25 @@
use simd::*;
define_ty! { f32x2, f32, f32 }
define_impl! { f32x2, f32, 2, x0, x1 }
define_impl! { f32x2, f32, 2, i32x2, x0, x1 }
define_ty! { u32x2, u32, u32 }
define_impl! { u32x2, u32, 2, x0, x1 }
define_impl! { u32x2, u32, 2, i32x2, x0, x1 }
define_ty! { i32x2, i32, i32 }
define_impl! { i32x2, i32, 2, x0, x1 }
define_impl! { i32x2, i32, 2, i32x2, x0, x1 }
define_ty! { u16x4, u16, u16, u16, u16 }
define_impl! { u16x4, u16, 4, x0, x1, x2, x3 }
define_impl! { u16x4, u16, 4, i16x4, x0, x1, x2, x3 }
define_ty! { i16x4, i16, i16, i16, i16 }
define_impl! { i16x4, i16, 4, x0, x1, x2, x3 }
define_impl! { i16x4, i16, 4, i16x4, x0, x1, x2, x3 }
define_ty! { u8x8, u8, u8, u8, u8, u8, u8, u8, u8 }
define_impl! { u8x8, u8, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_impl! { u8x8, u8, 8, i8x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! { i8x8, i8, i8, i8, i8, i8, i8, i8, i8 }
define_impl! { i8x8, i8, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_impl! { i8x8, i8, 8, i8x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_from!(u32x2, i32x2, u16x4, i16x4, u8x8, i8x8);
define_from!(i32x2, u32x2, u16x4, i16x4, u8x8, i8x8);
@ -37,3 +37,15 @@ define_integer_ops!(
(i16x4, i16),
(u8x8, u8),
(i8x8, i8));
define_casts!(
(f32x2, f64x2, as_f64x2),
(f32x2, u32x2, as_u32x2),
(f32x2, i32x2, as_i32x2),
(u32x2, f32x2, as_f32x2),
(u32x2, i32x2, as_i32x2),
(i32x2, f32x2, as_f32x2),
(i32x2, u32x2, as_u32x2),
(u16x4, i16x4, as_i16x4),
(i16x4, u16x4, as_u16x4),
(u8x8, i8x8, as_i8x8),
(i8x8, u8x8, as_u8x8));

View file

@ -1,4 +1,4 @@
// pub use self::sse::*;
pub use self::sse::*;
pub use self::sse2::*;
pub use self::ssse3::*;
pub use self::sse42::*;
@ -6,7 +6,7 @@ pub use self::sse42::*;
#[allow(non_camel_case_types)]
pub type __m128i = ::v128::i8x16;
// mod sse;
mod sse;
mod sse2;
mod ssse3;
mod sse42;

View file

@ -0,0 +1,128 @@
use v128::*;
/// Return the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
#[inline(always)]
#[target_feature = "+sse"]
pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
    // Lowers to `llvm.x86.sse.sqrt.ps` (declared in the extern block below).
    unsafe { sqrtps(a) }
}
/// Return the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`.
///
/// NOTE(review): this is a hardware approximation (`llvm.x86.sse.rcp.ps`),
/// not an exact 1/x — the precision bound is given by the Intel intrinsics
/// guide; confirm before relying on exact results.
#[inline(always)]
#[target_feature = "+sse"]
pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
    unsafe { rcpps(a) }
}
/// Return the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`.
///
/// NOTE(review): approximate result (`llvm.x86.sse.rsqrt.ps`), same caveat
/// as `_mm_rcp_ps`.
#[inline(always)]
#[target_feature = "+sse"]
pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
    unsafe { rsqrtps(a) }
}
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding minimum values.
#[inline(always)]
#[target_feature = "+sse"]
pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
    // Lowers to `llvm.x86.sse.min.ps` rather than a generic lane-wise min,
    // preserving the hardware's MINPS semantics.
    unsafe { minps(a, b) }
}
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding maximum values.
#[inline(always)]
#[target_feature = "+sse"]
pub fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
    // Lowers to `llvm.x86.sse.max.ps` rather than a generic lane-wise max,
    // preserving the hardware's MAXPS semantics.
    unsafe { maxps(a, b) }
}
/// Return a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 4 least significant bits of the return value.
/// All other bits are set to `0`.
#[inline(always)]
#[target_feature = "+sse"]
pub fn _mm_movemask_ps(a: f32x4) -> i32 {
    // Bit i of the result is the sign bit of lane i
    // (`llvm.x86.sse.movmsk.ps`).
    unsafe { movmskps(a) }
}
// Raw LLVM intrinsics backing the safe wrappers above; each `link_name`
// selects the corresponding x86 SSE operation. `improper_ctypes` is
// allowed because SIMD vector types are not ordinary C FFI types.
#[allow(improper_ctypes)]
extern {
    #[link_name = "llvm.x86.sse.sqrt.ps"]
    fn sqrtps(a: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.rcp.ps"]
    fn rcpps(a: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.rsqrt.ps"]
    fn rsqrtps(a: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.min.ps"]
    fn minps(a: f32x4, b: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.max.ps"]
    fn maxps(a: f32x4, b: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.movmsk.ps"]
    fn movmskps(a: f32x4) -> i32;
}
#[cfg(test)]
mod tests {
    use v128::*;
    use x86::sse;

    #[test]
    #[target_feature = "+sse"]
    fn _mm_sqrt_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_sqrt_ps(a);
        // sqrt(13) rounded to f32; the other three lanes are exact.
        let e = f32x4::new(2.0, 3.6055512, 4.0, 10.0);
        assert_eq!(r, e);
    }

    #[test]
    #[target_feature = "+sse"]
    fn _mm_rcp_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_rcp_ps(a);
        // NOTE(review): RCPPS is an approximation; these exact expected
        // values assume a particular hardware approximation and may not
        // hold bit-for-bit on every CPU — confirm across targets.
        let e = f32x4::new(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
        assert_eq!(r, e);
    }

    #[test]
    #[target_feature = "+sse"]
    fn _mm_rsqrt_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_rsqrt_ps(a);
        // NOTE(review): same approximation caveat as `_mm_rcp_ps`.
        let e = f32x4::new(0.49987793, 0.2772827, 0.24993896, 0.099990845);
        assert_eq!(r, e);
    }

    #[test]
    #[target_feature = "+sse"]
    fn _mm_min_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
        let r = sse::_mm_min_ps(a, b);
        // Lane-wise minimum of `a` and `b`.
        assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
    }

    #[test]
    #[target_feature = "+sse"]
    fn _mm_max_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
        let r = sse::_mm_max_ps(a, b);
        // Lane-wise maximum of `a` and `b`.
        assert_eq!(r, f32x4::new(-1.0, 20.0, 0.0, -5.0));
    }

    #[test]
    #[target_feature = "+sse"]
    fn _mm_movemask_ps() {
        // Negative lanes 0 and 2 -> bits 0 and 2 set.
        let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0));
        assert_eq!(r, 0b0101);
        // Negative lanes 0, 1, 2 -> low three bits set.
        let r = sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0));
        assert_eq!(r, 0b0111);
    }
}

View file

@ -2,7 +2,9 @@ use std::mem;
use std::os::raw::c_void;
use std::ptr;
use simd::*;
use simd::{
simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
};
use x86::__m128i;
use v128::*;
use v64::*;
@ -519,63 +521,63 @@ pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpeq_epi8(a: i8x16, b: i8x16) -> i8x16 {
unsafe { simd_eq(a, b) }
a.eq(b)
}
/// Compare packed 16-bit integers in `a` and `b` for equality.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpeq_epi16(a: i16x8, b: i16x8) -> i16x8 {
unsafe { simd_eq(a, b) }
a.eq(b)
}
/// Compare packed 32-bit integers in `a` and `b` for equality.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpeq_epi32(a: i32x4, b: i32x4) -> i32x4 {
unsafe { simd_eq(a, b) }
a.eq(b)
}
/// Compare packed 8-bit integers in `a` and `b` for greater-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpgt_epi8(a: i8x16, b: i8x16) -> i8x16 {
unsafe { simd_gt(a, b) }
a.gt(b)
}
/// Compare packed 16-bit integers in `a` and `b` for greater-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpgt_epi16(a: i16x8, b: i16x8) -> i16x8 {
unsafe { simd_gt(a, b) }
a.gt(b)
}
/// Compare packed 32-bit integers in `a` and `b` for greater-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpgt_epi32(a: i32x4, b: i32x4) -> i32x4 {
unsafe { simd_gt(a, b) }
a.gt(b)
}
/// Compare packed 8-bit integers in `a` and `b` for less-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmplt_epi8(a: i8x16, b: i8x16) -> i8x16 {
unsafe { simd_lt(a, b) }
a.lt(b)
}
/// Compare packed 16-bit integers in `a` and `b` for less-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmplt_epi16(a: i16x8, b: i16x8) -> i16x8 {
unsafe { simd_lt(a, b) }
a.lt(b)
}
/// Compare packed 32-bit integers in `a` and `b` for less-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 {
unsafe { simd_lt(a, b) }
a.lt(b)
}
/// Convert the lower two packed 32-bit integers in `a` to packed
@ -591,7 +593,7 @@ pub fn _mm_cvtepi32_pd(a: i32x4) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtsi32_sd(a: f64x2, b: i32) -> f64x2 {
a.insert(0, b as f64)
a.replace(0, b as f64)
}
/// Return `a` with its lower element replaced by `b` after converting it to
@ -599,7 +601,7 @@ pub fn _mm_cvtsi32_sd(a: f64x2, b: i32) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtsi64_sd(a: f64x2, b: i64) -> f64x2 {
a.insert(0, b as f64)
a.replace(0, b as f64)
}
/// Return `a` with its lower element replaced by `b` after converting it to
@ -842,7 +844,7 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_move_epi64(a: i64x2) -> i64x2 {
a.insert(1, 0)
a.replace(1, 0)
}
/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
@ -880,7 +882,7 @@ pub fn _mm_extract_epi16(a: i16x8, imm8: i32) -> i32 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_insert_epi16(a: i16x8, i: i32, imm8: i32) -> i16x8 {
a.insert(imm8 as u32 & 0b111, i as i16)
a.replace(imm8 as u32 & 0b111, i as i16)
}
/// Return a mask of the most significant bit of each element in `a`.
@ -1134,7 +1136,7 @@ pub fn _mm_unpacklo_epi64(a: i64x2, b: i64x2) -> i64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_add_sd(a: f64x2, b: f64x2) -> f64x2 {
a.insert(0, a.extract(0) + b.extract(0))
a.replace(0, a.extract(0) + b.extract(0))
}
/// Add packed double-precision (64-bit) floating-point elements in `a` and
@ -1150,7 +1152,7 @@ pub fn _mm_add_pd(a: f64x2, b: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_div_sd(a: f64x2, b: f64x2) -> f64x2 {
a.insert(0, a.extract(0) / b.extract(0))
a.replace(0, a.extract(0) / b.extract(0))
}
/// Divide packed double-precision (64-bit) floating-point elements in `a` by
@ -1198,7 +1200,7 @@ pub fn _mm_min_pd(a: f64x2, b: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_mul_sd(a: f64x2, b: f64x2) -> f64x2 {
a.insert(0, a.extract(0) * b.extract(0))
a.replace(0, a.extract(0) * b.extract(0))
}
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
@ -1214,7 +1216,7 @@ pub fn _mm_mul_pd(a: f64x2, b: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sqrt_sd(a: f64x2, b: f64x2) -> f64x2 {
a.insert(0, unsafe { sqrtsd(b).extract(0) })
a.replace(0, unsafe { sqrtsd(b).extract(0) })
}
/// Return a new vector with the square root of each of the values in `a`.
@ -1229,7 +1231,7 @@ pub fn _mm_sqrt_pd(a: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sub_sd(a: f64x2, b: f64x2) -> f64x2 {
a.insert(0, a.extract(0) - b.extract(0))
a.replace(0, a.extract(0) - b.extract(0))
}
/// Subtract packed double-precision (64-bit) floating-point elements in `b`
@ -1314,7 +1316,7 @@ pub fn _mm_cmple_sd(a: f64x2, b: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmplt_sd(b, a).insert(1, a.extract(1))
_mm_cmplt_sd(b, a).replace(1, a.extract(1))
}
/// Return a new vector with the low element of `a` replaced by the
@ -1322,7 +1324,7 @@ pub fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpge_sd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmple_sd(b, a).insert(1, a.extract(1))
_mm_cmple_sd(b, a).replace(1, a.extract(1))
}
/// Return a new vector with the low element of `a` replaced by the result
@ -1373,7 +1375,7 @@ pub fn _mm_cmpnle_sd(a: f64x2, b: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmpnlt_sd(b, a).insert(1, a.extract(1))
_mm_cmpnlt_sd(b, a).replace(1, a.extract(1))
}
/// Return a new vector with the low element of `a` replaced by the
@ -1381,7 +1383,7 @@ pub fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpnge_sd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmpnle_sd(b, a).insert(1, a.extract(1))
_mm_cmpnle_sd(b, a).replace(1, a.extract(1))
}
/// Compare corresponding elements in `a` and `b` for equality.
@ -1553,8 +1555,15 @@ pub fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool {
unsafe { mem::transmute(ucomineqsd(a, b) as u8) }
}
/// Return a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 2 least significant bits of the return value.
/// All other bits are set to `0`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_movemask_pd(a: f64x2) -> i32 {
    // Bit i of the result is the sign bit of lane i
    // (`llvm.x86.sse2.movmsk.pd`, declared in the extern block below).
    unsafe { movmskpd(a) }
}
@ -1703,6 +1712,8 @@ extern {
fn ucomigesd(a: f64x2, b: f64x2) -> i32;
#[link_name = "llvm.x86.sse2.ucomineq.sd"]
fn ucomineqsd(a: f64x2, b: f64x2) -> i32;
#[link_name = "llvm.x86.sse2.movmsk.pd"]
fn movmskpd(a: f64x2) -> i32;
}
#[cfg(test)]
@ -2306,7 +2317,7 @@ mod tests {
let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b = i16x8::new(7, 6, 2, 4, 3, 2, 1, 0);
let r = sse2::_mm_cmpeq_epi16(a, b);
assert_eq!(r, i16x8::splat(0).insert(2, 0xFFFFu16 as i16));
assert_eq!(r, i16x8::splat(0).replace(2, 0xFFFFu16 as i16));
}
#[test]
@ -2314,55 +2325,55 @@ mod tests {
let a = i32x4::new(0, 1, 2, 3);
let b = i32x4::new(3, 2, 2, 0);
let r = sse2::_mm_cmpeq_epi32(a, b);
assert_eq!(r, i32x4::splat(0).insert(2, 0xFFFFFFFFu32 as i32));
assert_eq!(r, i32x4::splat(0).replace(2, 0xFFFFFFFFu32 as i32));
}
#[test]
fn _mm_cmpgt_epi8() {
let a = i8x16::splat(0).insert(0, 5);
let a = i8x16::splat(0).replace(0, 5);
let b = i8x16::splat(0);
let r = sse2::_mm_cmpgt_epi8(a, b);
assert_eq!(r, i8x16::splat(0).insert(0, 0xFFu8 as i8));
assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8));
}
#[test]
fn _mm_cmpgt_epi16() {
let a = i16x8::splat(0).insert(0, 5);
let a = i16x8::splat(0).replace(0, 5);
let b = i16x8::splat(0);
let r = sse2::_mm_cmpgt_epi16(a, b);
assert_eq!(r, i16x8::splat(0).insert(0, 0xFFFFu16 as i16));
assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16));
}
#[test]
fn _mm_cmpgt_epi32() {
let a = i32x4::splat(0).insert(0, 5);
let a = i32x4::splat(0).replace(0, 5);
let b = i32x4::splat(0);
let r = sse2::_mm_cmpgt_epi32(a, b);
assert_eq!(r, i32x4::splat(0).insert(0, 0xFFFFFFFFu32 as i32));
assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
}
#[test]
fn _mm_cmplt_epi8() {
let a = i8x16::splat(0);
let b = i8x16::splat(0).insert(0, 5);
let b = i8x16::splat(0).replace(0, 5);
let r = sse2::_mm_cmplt_epi8(a, b);
assert_eq!(r, i8x16::splat(0).insert(0, 0xFFu8 as i8));
assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8));
}
#[test]
fn _mm_cmplt_epi16() {
let a = i16x8::splat(0);
let b = i16x8::splat(0).insert(0, 5);
let b = i16x8::splat(0).replace(0, 5);
let r = sse2::_mm_cmplt_epi16(a, b);
assert_eq!(r, i16x8::splat(0).insert(0, 0xFFFFu16 as i16));
assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16));
}
#[test]
fn _mm_cmplt_epi32() {
let a = i32x4::splat(0);
let b = i32x4::splat(0).insert(0, 5);
let b = i32x4::splat(0).replace(0, 5);
let r = sse2::_mm_cmplt_epi32(a, b);
assert_eq!(r, i32x4::splat(0).insert(0, 0xFFFFFFFFu32 as i32));
assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
}
#[test]
@ -2504,12 +2515,12 @@ mod tests {
#[test]
fn _mm_maskmoveu_si128() {
let a = i8x16::splat(9);
let mask = i8x16::splat(0).insert(2, 0x80u8 as i8);
let mask = i8x16::splat(0).replace(2, 0x80u8 as i8);
let mut r = i8x16::splat(0);
unsafe {
sse2::_mm_maskmoveu_si128(a, mask, &mut r as *mut _ as *mut i8);
}
assert_eq!(r, i8x16::splat(0).insert(2, 9));
assert_eq!(r, i8x16::splat(0).replace(2, 9));
}
#[test]
@ -2586,7 +2597,7 @@ mod tests {
#[test]
fn _mm_insert_epi16() {
let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
assert_eq!(sse2::_mm_insert_epi16(a, 9, 0), a.insert(0, 9));
assert_eq!(sse2::_mm_insert_epi16(a, 9, 0), a.replace(0, 9));
}
#[test]
@ -3207,4 +3218,13 @@ mod tests {
let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
assert!(!sse2::_mm_ucomineq_sd(a, b));
}
#[test]
fn _mm_movemask_pd() {
    // Only lane 0 is negative -> bit 0 set.
    let r = sse2::_mm_movemask_pd(f64x2::new(-1.0, 5.0));
    assert_eq!(r, 0b01);
    // Both lanes negative -> both low bits set.
    let r = sse2::_mm_movemask_pd(f64x2::new(-1.0, -5.0));
    assert_eq!(r, 0b11);
}
}