Migrate a bunch of i586::sse2 to native types (#273)

This commit is contained in:
Alex Crichton 2018-01-10 12:42:26 -06:00 committed by GitHub
parent baf9d0e7e0
commit 6d8d2f81e9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 1574 additions and 1532 deletions

View file

@ -843,7 +843,7 @@ pub unsafe fn _mm256_extractf128_ps(a: f32x8, imm8: i32) -> __m128 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vextractf128))]
pub unsafe fn _mm256_extractf128_pd(a: f64x4, imm8: i32) -> f64x2 {
pub unsafe fn _mm256_extractf128_pd(a: f64x4, imm8: i32) -> __m128d {
match imm8 & 1 {
0 => simd_shuffle2(a, _mm256_undefined_pd(), [0, 1]),
_ => simd_shuffle2(a, _mm256_undefined_pd(), [2, 3]),
@ -1068,9 +1068,7 @@ pub unsafe fn _mm256_permute_pd(a: f64x4, imm8: i32) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx,+sse2"]
#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))]
pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 {
use x86::i586::sse2::_mm_undefined_pd;
pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d {
let imm8 = (imm8 & 0xFF) as u8;
macro_rules! shuffle2 {
($a:expr, $b:expr) => {
@ -1194,7 +1192,7 @@ pub unsafe fn _mm256_insertf128_ps(a: f32x8, b: __m128, imm8: i32) -> f32x8 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 {
pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: __m128d, imm8: i32) -> f64x4 {
match imm8 & 1 {
0 => simd_shuffle4(a, _mm256_castpd128_pd256(b), [4, 5, 2, 3]),
_ => simd_shuffle4(a, _mm256_castpd128_pd256(b), [0, 1, 4, 5]),
@ -2139,7 +2137,7 @@ pub unsafe fn _mm256_castps256_ps128(a: f32x8) -> __m128 {
#[target_feature = "+avx"]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
pub unsafe fn _mm256_castpd256_pd128(a: f64x4) -> f64x2 {
pub unsafe fn _mm256_castpd256_pd128(a: f64x4) -> __m128d {
simd_shuffle2(a, a, [0, 1])
}
@ -2171,7 +2169,7 @@ pub unsafe fn _mm256_castps128_ps256(a: __m128) -> f32x8 {
#[target_feature = "+avx"]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
pub unsafe fn _mm256_castpd128_pd256(a: f64x2) -> f64x4 {
pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> f64x4 {
// FIXME simd_shuffle4(a, a, [0, 1, -1, -1])
simd_shuffle4(a, a, [0, 1, 0, 0])
}
@ -2221,8 +2219,7 @@ pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
#[target_feature = "+avx,+sse2"]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
pub unsafe fn _mm256_zextpd128_pd256(a: f64x2) -> f64x4 {
use x86::i586::sse2::_mm_setzero_pd;
pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> f64x4 {
simd_shuffle4(a, _mm_setzero_pd(), [0, 1, 2, 3])
}
@ -2326,7 +2323,6 @@ pub unsafe fn _mm256_loadu2_m128(
pub unsafe fn _mm256_loadu2_m128d(
hiaddr: *const f64, loaddr: *const f64
) -> f64x4 {
use x86::i586::sse2::_mm_loadu_pd;
let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
_mm256_insertf128_pd(a, _mm_loadu_pd(hiaddr), 1)
}
@ -2371,7 +2367,6 @@ pub unsafe fn _mm256_storeu2_m128(
pub unsafe fn _mm256_storeu2_m128d(
hiaddr: *mut f64, loaddr: *mut f64, a: f64x4
) {
use x86::i586::sse2::_mm_storeu_pd;
let lo = _mm256_castpd256_pd128(a);
_mm_storeu_pd(loaddr, lo);
let hi = _mm256_extractf128_pd(a, 1);
@ -3104,9 +3099,9 @@ mod tests {
#[simd_test = "avx"]
unsafe fn test_mm256_extractf128_pd() {
let a = f64x4::new(4., 3., 2., 5.);
let r = avx::_mm256_extractf128_pd(a, 0);
let e = f64x2::new(4., 3.);
assert_eq!(r, e);
let r = _mm256_extractf128_pd(a, 0);
let e = _mm_setr_pd(4., 3.);
assert_eq_m128d(r, e);
}
#[simd_test = "avx"]
@ -3189,10 +3184,10 @@ mod tests {
#[simd_test = "avx"]
unsafe fn test_mm_permute_pd() {
let a = f64x2::new(4., 3.);
let r = avx::_mm_permute_pd(a, 1);
let e = f64x2::new(3., 4.);
assert_eq!(r, e);
let a = _mm_setr_pd(4., 3.);
let r = _mm_permute_pd(a, 1);
let e = _mm_setr_pd(3., 4.);
assert_eq_m128d(r, e);
}
#[simd_test = "avx"]
@ -3271,8 +3266,8 @@ mod tests {
#[simd_test = "avx"]
unsafe fn test_mm256_insertf128_pd() {
let a = f64x4::new(1., 2., 3., 4.);
let b = f64x2::new(5., 6.);
let r = avx::_mm256_insertf128_pd(a, b, 0);
let b = _mm_setr_pd(5., 6.);
let r = _mm256_insertf128_pd(a, b, 0);
let e = f64x4::new(5., 6., 3., 4.);
assert_eq!(r, e);
}
@ -4078,8 +4073,8 @@ mod tests {
#[simd_test = "avx"]
unsafe fn test_mm256_castpd256_pd128() {
let a = f64x4::new(1., 2., 3., 4.);
let r = avx::_mm256_castpd256_pd128(a);
assert_eq!(r, f64x2::new(1., 2.));
let r = _mm256_castpd256_pd128(a);
assert_eq_m128d(r, _mm_setr_pd(1., 2.));
}
#[simd_test = "avx"]
@ -4107,8 +4102,8 @@ mod tests {
#[simd_test = "avx"]
unsafe fn test_mm256_zextpd128_pd256() {
let a = f64x2::new(1., 2.);
let r = avx::_mm256_zextpd128_pd256(a);
let a = _mm_setr_pd(1., 2.);
let r = _mm256_zextpd128_pd256(a);
let e = f64x4::new(1., 2., 0., 0.);
assert_eq!(r, e);
}
@ -4271,8 +4266,8 @@ mod tests {
&mut lo as *mut _ as *mut f64,
a,
);
assert_eq!(hi, f64x2::new(3., 4.));
assert_eq!(lo, f64x2::new(1., 2.));
assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
}
#[simd_test = "avx"]

View file

@ -1711,20 +1711,6 @@ mod tests {
use stdsimd_test::simd_test;
use test::black_box; // Used to inhibit constant-folding.
#[target_feature = "+sse"]
unsafe fn assert_eq_m128(a: __m128, b: __m128) {
let r = _mm_cmpeq_ps(a, b);
if _mm_movemask_ps(r) != 0b1111 {
panic!("{:?} != {:?}", a, b);
}
}
#[target_feature = "+sse"]
unsafe fn get_m128(a: __m128, idx: usize) -> f32 {
union A { a: __m128, b: [f32; 4] };
transmute::<__m128, A>(a).b[idx]
}
#[simd_test = "sse"]
unsafe fn test_mm_add_ps() {
let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);

File diff suppressed because it is too large Load diff

View file

@ -2,6 +2,7 @@
use simd_llvm::{simd_shuffle2, simd_shuffle4};
use v128::*;
use x86::*;
#[cfg(test)]
use stdsimd_test::assert_instr;
@ -84,8 +85,7 @@ pub unsafe fn _mm_movedup_pd(a: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse3"]
#[cfg_attr(test, assert_instr(movddup))]
pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> f64x2 {
use x86::i586::sse2::_mm_load1_pd;
pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d {
_mm_load1_pd(mem_addr)
}
@ -131,9 +131,10 @@ mod tests {
use v128::*;
use x86::i586::sse3;
use x86::*;
#[simd_test = "sse3"]
unsafe fn _mm_addsub_ps() {
unsafe fn test_mm_addsub_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse3::_mm_addsub_ps(a, b);
@ -141,7 +142,7 @@ mod tests {
}
#[simd_test = "sse3"]
unsafe fn _mm_addsub_pd() {
unsafe fn test_mm_addsub_pd() {
let a = f64x2::new(-1.0, 5.0);
let b = f64x2::new(-100.0, 20.0);
let r = sse3::_mm_addsub_pd(a, b);
@ -149,7 +150,7 @@ mod tests {
}
#[simd_test = "sse3"]
unsafe fn _mm_hadd_pd() {
unsafe fn test_mm_hadd_pd() {
let a = f64x2::new(-1.0, 5.0);
let b = f64x2::new(-100.0, 20.0);
let r = sse3::_mm_hadd_pd(a, b);
@ -157,7 +158,7 @@ mod tests {
}
#[simd_test = "sse3"]
unsafe fn _mm_hadd_ps() {
unsafe fn test_mm_hadd_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse3::_mm_hadd_ps(a, b);
@ -165,7 +166,7 @@ mod tests {
}
#[simd_test = "sse3"]
unsafe fn _mm_hsub_pd() {
unsafe fn test_mm_hsub_pd() {
let a = f64x2::new(-1.0, 5.0);
let b = f64x2::new(-100.0, 20.0);
let r = sse3::_mm_hsub_pd(a, b);
@ -173,7 +174,7 @@ mod tests {
}
#[simd_test = "sse3"]
unsafe fn _mm_hsub_ps() {
unsafe fn test_mm_hsub_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse3::_mm_hsub_ps(a, b);
@ -181,7 +182,7 @@ mod tests {
}
#[simd_test = "sse3"]
unsafe fn _mm_lddqu_si128() {
unsafe fn test_mm_lddqu_si128() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = __m128i::from(i8x16::new(
1, 2, 3, 4,
@ -194,30 +195,30 @@ mod tests {
}
#[simd_test = "sse3"]
unsafe fn _mm_movedup_pd() {
unsafe fn test_mm_movedup_pd() {
let a = f64x2::new(-1.0, 5.0);
let r = sse3::_mm_movedup_pd(a);
assert_eq!(r, f64x2::new(-1.0, -1.0));
}
#[simd_test = "sse3"]
unsafe fn _mm_movehdup_ps() {
unsafe fn test_mm_movehdup_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let r = sse3::_mm_movehdup_ps(a);
assert_eq!(r, f32x4::new(5.0, 5.0, -10.0, -10.0));
}
#[simd_test = "sse3"]
unsafe fn _mm_moveldup_ps() {
unsafe fn test_mm_moveldup_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let r = sse3::_mm_moveldup_ps(a);
assert_eq!(r, f32x4::new(-1.0, -1.0, 0.0, 0.0));
}
#[simd_test = "sse3"]
unsafe fn _mm_loaddup_pd() {
unsafe fn test_mm_loaddup_pd() {
let d = -5.0;
let r = sse3::_mm_loaddup_pd(&d);
assert_eq!(r, f64x2::new(d, d));
let r = _mm_loaddup_pd(&d);
assert_eq_m128d(r, _mm_setr_pd(d, d));
}
}

View file

@ -1,6 +1,7 @@
//! `i686`'s Streaming SIMD Extensions 4.1 (SSE4.1)
use v128::*;
use x86::*;
#[cfg(test)]
use stdsimd_test::assert_instr;
@ -111,8 +112,7 @@ pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
let b = i32x4::from(a);
_mm_testc_si128(a, __m128i::from(::x86::_mm_cmpeq_epi32(b, b)))
_mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}
/// Tests whether the specified bits in a 128-bit integer vector are

View file

@ -1,5 +1,7 @@
//! `x86` and `x86_64` intrinsics.
use core::mem;
#[macro_use]
mod macros;
@ -8,6 +10,70 @@ mod macros;
#[allow(non_camel_case_types)]
pub struct __m128(f32, f32, f32, f32);
/// A 128-bit SIMD vector type holding two packed `f64` lanes — the
/// `__m128d` type used by the SSE2 floating-point intrinsics.
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct __m128d(f64, f64);
pub use v128::__m128i;
pub use v64::__m64;
#[cfg(test)]
mod test;
#[cfg(test)]
pub use self::test::*;
#[doc(hidden)]
#[allow(non_camel_case_types)]
// Internal convenience trait: reinterprets the raw bits of a 128-bit
// integer vector as any of the lane-typed `v128` vector types. Every
// method is a zero-cost `transmute` (a bit reinterpretation, not a lane
// conversion), which is sound because all of these types are 128 bits
// wide with identical layout.
trait m128iExt: Sized {
// Returns `self` as the canonical `__m128i` that the casts below
// transmute from.
fn as_m128i(self) -> __m128i;
// Reinterpret as 16 unsigned 8-bit lanes.
#[inline(always)]
fn as_u8x16(self) -> ::v128::u8x16 {
unsafe { mem::transmute(self.as_m128i()) }
}
// Reinterpret as 8 unsigned 16-bit lanes.
#[inline(always)]
fn as_u16x8(self) -> ::v128::u16x8 {
unsafe { mem::transmute(self.as_m128i()) }
}
// Reinterpret as 4 unsigned 32-bit lanes.
#[inline(always)]
fn as_u32x4(self) -> ::v128::u32x4 {
unsafe { mem::transmute(self.as_m128i()) }
}
// Reinterpret as 2 unsigned 64-bit lanes.
#[inline(always)]
fn as_u64x2(self) -> ::v128::u64x2 {
unsafe { mem::transmute(self.as_m128i()) }
}
// Reinterpret as 16 signed 8-bit lanes.
#[inline(always)]
fn as_i8x16(self) -> ::v128::i8x16 {
unsafe { mem::transmute(self.as_m128i()) }
}
// Reinterpret as 8 signed 16-bit lanes.
#[inline(always)]
fn as_i16x8(self) -> ::v128::i16x8 {
unsafe { mem::transmute(self.as_m128i()) }
}
// Reinterpret as 4 signed 32-bit lanes.
#[inline(always)]
fn as_i32x4(self) -> ::v128::i32x4 {
unsafe { mem::transmute(self.as_m128i()) }
}
// Reinterpret as 2 signed 64-bit lanes.
#[inline(always)]
fn as_i64x2(self) -> ::v128::i64x2 {
unsafe { mem::transmute(self.as_m128i()) }
}
}
// `__m128i` is itself the canonical integer vector, so the conversion is
// the identity; all the `as_*` reinterpretation methods come for free.
impl m128iExt for __m128i {
#[inline(always)]
fn as_m128i(self) -> __m128i { self }
}
mod i386;
pub use self::i386::*;

View file

@ -0,0 +1,32 @@
//! Utilities used in testing the x86 intrinsics
use std::mem;
use x86::*;
// Test helper: asserts lane-wise equality of two `__m128d` values using
// SSE2 compare + movemask. `_mm_cmpeq_pd` produces an all-ones lane for
// each pair of equal lanes and `_mm_movemask_pd` packs the two lane sign
// bits, so a mask of `0b11` means both lanes compared equal. NOTE: a
// `cmpeq` against NaN is false, so any NaN lane makes this panic.
#[target_feature = "+sse2"]
pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
panic!("{:?} != {:?}", a, b);
}
}
// Test helper: extracts lane `idx` (0 or 1) of a `__m128d` by
// reinterpreting the vector as an `[f64; 2]` through a union; an
// out-of-range `idx` panics on the array bounds check.
#[target_feature = "+sse2"]
pub unsafe fn get_m128d(a: __m128d, idx: usize) -> f64 {
union A { a: __m128d, b: [f64; 2] };
mem::transmute::<__m128d, A>(a).b[idx]
}
// Test helper: asserts lane-wise equality of two `__m128` values.
// `_mm_cmpeq_ps` yields an all-ones lane per equal pair and
// `_mm_movemask_ps` packs the four lane sign bits, so `0b1111` means all
// four lanes matched. NOTE: `cmpeq` on NaN is false, so NaN lanes never
// compare equal and trigger the panic.
#[target_feature = "+sse"]
pub unsafe fn assert_eq_m128(a: __m128, b: __m128) {
let r = _mm_cmpeq_ps(a, b);
if _mm_movemask_ps(r) != 0b1111 {
panic!("{:?} != {:?}", a, b);
}
}
// Test helper: extracts lane `idx` (0..4) of a `__m128` by
// reinterpreting the vector as an `[f32; 4]` through a union; an
// out-of-range `idx` panics on the array bounds check.
#[target_feature = "+sse"]
pub unsafe fn get_m128(a: __m128, idx: usize) -> f32 {
union A { a: __m128, b: [f32; 4] };
mem::transmute::<__m128, A>(a).b[idx]
}

View file

@ -63,9 +63,12 @@
//! #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
//! #[target_feature = "+sse2"]
//! unsafe fn sum_sse2(x: i32x4) -> i32 {
//! let x = vendor::_mm_add_epi32(x, vendor::_mm_srli_si128(x.into(), 8).into());
//! let x = vendor::_mm_add_epi32(x, vendor::_mm_srli_si128(x.into(), 4).into());
//! vendor::_mm_cvtsi128_si32(x)
//! use std::mem;
//! let x: vendor::__m128i = mem::transmute(x);
//! let x = vendor::_mm_add_epi32(x, vendor::_mm_srli_si128(x, 8));
//! let x = vendor::_mm_add_epi32(x, vendor::_mm_srli_si128(x, 4));
//! let ret = vendor::_mm_cvtsi128_si32(x);
//! mem::transmute(ret)
//! }
//!
//! // Uses the SSE2 version if SSE2 is enabled for all target

View file

@ -98,6 +98,7 @@ fn to_type(t: &syn::Type) -> Tokens {
match *t {
syn::Type::Path(ref p) => match extract_path_ident(&p.path).as_ref() {
"__m128" => my_quote! { &F32x4 },
"__m128d" => my_quote! { &F64x2 },
"__m128i" => my_quote! { &I8x16 },
"__m256i" => my_quote! { &I8x32 },
"__m64" => my_quote! { &I8x8 },
@ -178,6 +179,10 @@ fn walk(root: &Path, files: &mut Vec<syn::File>) {
continue;
}
if path.file_name().and_then(|s| s.to_str()) == Some("test.rs") {
continue
}
let mut contents = String::new();
File::open(&path)
.unwrap()