add mmx module, mmx run-time detection, intrinsics (#220)

* [sse] _mm_cvtps_pi32, _mm_cvt_ps2pi * [mmx] run-time detection support * [x86] add mmx module * [x86] make __m64 public * [sse] add _mm_cvtps_pi{8,16}, _mm_cvttps_pi32, _mm_cvtt_ps2pi * move new intrinsics from i586 to i686 module * mmx requires i686
2017-11-28 16:45:41 +01:00 · 2017-11-28 16:45:41 +01:00 · 288a30a93e
commit 288a30a93e
parent ef847ac83b
6 changed files with 205 additions and 16 deletions
--- a/library/stdarch/coresimd/src/runtime/x86.rs
+++ b/library/stdarch/coresimd/src/runtime/x86.rs
@ -29,6 +29,9 @@ use super::bit;
 #[macro_export]
 #[doc(hidden)]
 macro_rules! __unstable_detect_feature {
+    ("mmx") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::mmx{})  };
    ("sse") => {
        $crate::vendor::__unstable_detect_feature(
            $crate::vendor::__Feature::sse{})  };
@ -165,6 +168,8 @@ macro_rules! __unstable_detect_feature {
 #[allow(non_camel_case_types)]
 #[repr(u8)]
 pub enum __Feature {
+    /// MMX
+    mmx,
    /// SSE (Streaming SIMD Extensions)
    sse,
    /// SSE2 (Streaming SIMD Extensions 2)
@ -332,6 +337,7 @@ pub fn detect_features() -> usize {
        enable(proc_info_ecx, 20, __Feature::sse4_2);
        enable(proc_info_ecx, 23, __Feature::popcnt);
        enable(proc_info_edx, 24, __Feature::fxsr);
+        enable(proc_info_edx, 23, __Feature::mmx);
        enable(proc_info_edx, 25, __Feature::sse);
        enable(proc_info_edx, 26, __Feature::sse2);

--- a/library/stdarch/coresimd/src/x86/i586/sse.rs
+++ b/library/stdarch/coresimd/src/x86/i586/sse.rs
@ -626,10 +626,6 @@ pub unsafe fn _mm_cvt_ss2si(a: f32x4) -> i32 {
    _mm_cvtss_si32(a)
 }

-// Blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74
-// pub unsafe fn _mm_cvtps_pi32(a: f32x4) -> i32x2
-// pub unsafe fn _mm_cvt_ps2pi(a: f32x4) -> i32x2 { _mm_cvtps_pi32(a) }
-
 /// Convert the lowest 32 bit float in the input vector to a 32 bit integer
 /// with
 /// truncation.
@ -655,10 +651,6 @@ pub unsafe fn _mm_cvtt_ss2si(a: f32x4) -> i32 {
    _mm_cvttss_si32(a)
 }

-// Blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74
-// pub unsafe fn _mm_cvttps_pi32(a: f32x4) -> i32x2;
-// pub unsafe fn _mm_cvtt_ps2pi(a: f32x4) -> i32x2 { _mm_cvttps_pi32(a) }
-
 /// Extract the lowest 32 bit float from the input vector.
 #[inline(always)]
 #[target_feature = "+sse"]
--- a/library/stdarch/coresimd/src/x86/i686/mmx.rs
+++ b/library/stdarch/coresimd/src/x86/i686/mmx.rs
@ -0,0 +1,88 @@
+//! `i686` MMX instruction set.
+//!
+//! The intrinsics here roughly correspond to those in the `mmintrin.h` C
+//! header.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+
+use v64::{i16x4, i32x2, i8x8};
+use x86::__m64;
+use core::mem;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Constructs a 64-bit integer vector initialized to zero.
+#[inline(always)]
+#[target_feature = "+mmx,+sse"]
+// FIXME: this produces a movl instead of xorps on x86
+// FIXME: this produces a xor instruction instead of xorps on x86_64
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(xor))]
+pub unsafe fn _mm_setzero_si64() -> __m64 {
+    mem::transmute(0_i64)
+}
+
+/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using signed saturation.
+///
+/// Positive values greater than 0x7F are saturated to 0x7F. Negative values
+/// less than 0x80 are saturated to 0x80.
+#[inline(always)]
+#[target_feature = "+mmx,+sse"]
+#[cfg_attr(test, assert_instr(packsswb))]
+pub unsafe fn _mm_packs_pi16(a: i16x4, b: i16x4) -> i8x8 {
+    mem::transmute(packsswb(mem::transmute(a), mem::transmute(b)))
+}
+
+/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using signed saturation.
+///
+/// Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
+/// values less than 0x8000 are saturated to 0x8000.
+#[inline(always)]
+#[target_feature = "+mmx,+sse"]
+#[cfg_attr(test, assert_instr(packssdw))]
+pub unsafe fn _mm_packs_pi32(a: i32x2, b: i32x2) -> i16x4 {
+    mem::transmute(packssdw(mem::transmute(a), mem::transmute(b)))
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.mmx.packsswb"]
+    fn packsswb(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.packssdw"]
+    fn packssdw(a: __m64, b: __m64) -> __m64;
+}
+
+#[cfg(test)]
+mod tests {
+    use v64::{i16x4, i32x2, i8x8};
+    use x86::i686::mmx;
+    use x86::__m64;
+    use stdsimd_test::simd_test;
+
+    #[simd_test = "sse"] // FIXME: should be mmx
+    unsafe fn _mm_setzero_si64() {
+        let r: __m64 = ::std::mem::transmute(0_i64);
+        assert_eq!(r, mmx::_mm_setzero_si64());
+    }
+
+    #[simd_test = "sse"] // FIXME: should be mmx
+    unsafe fn _mm_packs_pi16() {
+        let a = i16x4::new(-1, 2, -3, 4);
+        let b = i16x4::new(-5, 6, -7, 8);
+        let r = i8x8::new(-1, 2, -3, 4, -5, 6, -7, 8);
+        assert_eq!(r, mmx::_mm_packs_pi16(a, b));
+    }
+
+    #[simd_test = "sse"] // FIXME: should be mmx
+    unsafe fn _mm_packs_pi32() {
+        let a = i32x2::new(-1, 2);
+        let b = i32x2::new(-5, 6);
+        let r = i16x4::new(-1, 2, -5, 6);
+        assert_eq!(r, mmx::_mm_packs_pi32(a, b));
+    }
+}
--- a/library/stdarch/coresimd/src/x86/i686/mod.rs
+++ b/library/stdarch/coresimd/src/x86/i686/mod.rs
@ -1,5 +1,8 @@
 //! `i686` intrinsics

+mod mmx;
+pub use self::mmx::*;
+
 mod sse;
 pub use self::sse::*;

--- a/library/stdarch/coresimd/src/x86/i686/sse.rs
+++ b/library/stdarch/coresimd/src/x86/i686/sse.rs
@ -1,17 +1,15 @@
 //! `i686` Streaming SIMD Extensions (SSE)

-use v64::{i16x4, u8x8};
+use v128::f32x4;
+use v64::{i16x4, i32x2, i8x8, u8x8};
+use x86::__m64;
 use core::mem;
+use x86::i586;
+use x86::i686::mmx;

 #[cfg(test)]
 use stdsimd_test::assert_instr;

-/// This type is only required for mapping vector types to llvm's `x86_mmx`
-/// type.
-#[allow(non_camel_case_types)]
-#[repr(simd)]
-struct __m64(i64);
-
 #[allow(improper_ctypes)]
 extern "C" {
    #[link_name = "llvm.x86.mmx.pmaxs.w"]
@ -22,6 +20,10 @@ extern "C" {
    fn pminsw(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.mmx.pminu.b"]
    fn pminub(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.sse.cvtps2pi"]
+    fn cvtps2pi(a: f32x4) -> __m64;
+    #[link_name = "llvm.x86.sse.cvttps2pi"]
+    fn cvttps2pi(a: f32x4) -> __m64;
 }

 /// Compares the packed 16-bit signed integers of `a` and `b` writing the
@ -96,9 +98,70 @@ pub unsafe fn _m_pminub(a: u8x8, b: u8x8) -> u8x8 {
    _mm_min_pu8(a, b)
 }

+/// Convert the two lower packed single-precision (32-bit) floating-point
+/// elements in `a` to packed 32-bit integers with truncation.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvttps2pi))]
+pub unsafe fn _mm_cvttps_pi32(a: f32x4) -> i32x2 {
+    mem::transmute(cvttps2pi(a))
+}
+
+/// Convert the two lower packed single-precision (32-bit) floating-point
+/// elements in `a` to packed 32-bit integers with truncation.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvttps2pi))]
+pub unsafe fn _mm_cvtt_ps2pi(a: f32x4) -> i32x2 {
+    _mm_cvttps_pi32(a)
+}
+
+/// Convert the two lower packed single-precision (32-bit) floating-point
+/// elements in `a` to packed 32-bit integers.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtps2pi))]
+pub unsafe fn _mm_cvtps_pi32(a: f32x4) -> i32x2 {
+    mem::transmute(cvtps2pi(a))
+}
+
+/// Convert the two lower packed single-precision (32-bit) floating-point
+/// elements in `a` to packed 32-bit integers.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtps2pi))]
+pub unsafe fn _mm_cvt_ps2pi(a: f32x4) -> i32x2 {
+    _mm_cvtps_pi32(a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to
+/// packed 16-bit integers.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtps2pi))]
+pub unsafe fn _mm_cvtps_pi16(a: f32x4) -> i16x4 {
+    let b = _mm_cvtps_pi32(a);
+    let a = i586::_mm_movehl_ps(a, a);
+    let c = _mm_cvtps_pi32(a);
+    mmx::_mm_packs_pi32(b, c)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to
+/// packed 8-bit integers, and return them in the lower 4 elements of the
+/// result.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtps2pi))]
+pub unsafe fn _mm_cvtps_pi8(a: f32x4) -> i8x8 {
+    let b = _mm_cvtps_pi16(a);
+    let c = mmx::_mm_setzero_si64();
+    mmx::_mm_packs_pi16(b, mem::transmute(c))
+}
+
 #[cfg(test)]
 mod tests {
-    use v64::{i16x4, u8x8};
+    use v128::f32x4;
+    use v64::{i16x4, i32x2, i8x8, u8x8};
    use x86::i686::sse;
    use stdsimd_test::simd_test;

@ -141,4 +204,36 @@ mod tests {
        assert_eq!(r, sse::_mm_min_pu8(a, b));
        assert_eq!(r, sse::_m_pminub(a, b));
    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtps_pi32() {
+        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
+        let r = i32x2::new(1, 2);
+
+        assert_eq!(r, sse::_mm_cvtps_pi32(a));
+        assert_eq!(r, sse::_mm_cvt_ps2pi(a));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvttps_pi32() {
+        let a = f32x4::new(7.0, 2.0, 3.0, 4.0);
+        let r = i32x2::new(7, 2);
+
+        assert_eq!(r, sse::_mm_cvttps_pi32(a));
+        assert_eq!(r, sse::_mm_cvtt_ps2pi(a));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtps_pi16() {
+        let a = f32x4::new(7.0, 2.0, 3.0, 4.0);
+        let r = i16x4::new(7, 2, 3, 4);
+        assert_eq!(r, sse::_mm_cvtps_pi16(a));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtps_pi8() {
+        let a = f32x4::new(7.0, 2.0, 3.0, 4.0);
+        let r = i8x8::new(7, 2, 3, 4, 0, 0, 0, 0);
+        assert_eq!(r, sse::_mm_cvtps_pi8(a));
+    }
 }
--- a/library/stdarch/coresimd/src/x86/mod.rs
+++ b/library/stdarch/coresimd/src/x86/mod.rs
@ -26,6 +26,11 @@ mod x86_64;
 #[cfg(target_arch = "x86_64")]
 pub use self::x86_64::*;

+/// 64-bit wide integer vector type.
+#[allow(non_camel_case_types)]
+#[repr(simd)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct __m64(i64); // corresponds to llvm's `x86_mmx` type
 /// 128-bit wide signed integer vector type
 #[allow(non_camel_case_types)]
 pub type __m128i = ::v128::i8x16;