Implement _mm_getcsr, _mm_setcsr, _mm_sfence (#88)

* Add _mm_sfence * Add _mm_getcsr/_mm_setcsr and convenience wrappers * Use test::black_box to simplify tests * Use uppercase naming for C-macro equivalents Discussed at https://github.com/rust-lang-nursery/stdsimd/issues/84
2017-10-05 18:17:43 +02:00 · 2017-10-05 18:17:43 +02:00 · 186b8fe093
commit 186b8fe093
parent c845a1baaf
2 changed files with 322 additions and 1 deletions
--- a/library/stdarch/src/lib.rs
+++ b/library/stdarch/src/lib.rs
@ -119,11 +119,14 @@
    const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi,
    target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new
 )]
-#![cfg_attr(test, feature(proc_macro))]
+#![cfg_attr(test, feature(proc_macro, test))]

 #[cfg(test)]
 extern crate stdsimd_test;

+#[cfg(test)]
+extern crate test;
+
 /// Platform independent SIMD vector types and operations.
 pub mod simd {
    pub use v128::*;
--- a/library/stdarch/src/x86/sse.rs
+++ b/library/stdarch/src/x86/sse.rs
@ -268,6 +268,259 @@ pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
    movmskps(a)
 }

+/// Perform a serializing operation on all store-to-memory instructions that
+/// were issued prior to this instruction.
+///
+/// Guarantees that every store instruction that precedes, in program order, is
+/// globally visible before any store instruction which follows the fence in
+/// program order.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(sfence))]
+pub unsafe fn _mm_sfence() {
+    sfence()
+}
+
+/// Get the unsigned 32-bit value of the MXCSR control and status register.
+///
+/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(stmxcsr))]
+pub unsafe fn _mm_getcsr() -> u32 {
+    let mut result = 0i32;
+    stmxcsr((&mut result) as *mut _ as *mut i8);
+    result as u32
+}
+
+/// Set the MXCSR register with the 32-bit unsigned integer value.
+///
+/// This register constrols how SIMD instructions handle floating point
+/// operations. Modifying this register only affects the current thread.
+///
+/// It contains several groups of flags:
+///
+/// * *Exception flags* report which exceptions occurred since last they were
+/// reset.
+///
+/// * *Masking flags* can be used to mask (ignore) certain exceptions. By default
+/// these flags are all set to 1, so all exceptions are masked. When an
+/// an exception is masked, the processor simply sets the exception flag and
+/// continues the operation. If the exception is unmasked, the flag is also set
+/// but additionally an exception handler is invoked.
+///
+/// * *Rounding mode flags* control the rounding mode of floating point
+/// instructions.
+///
+/// * The *denormals-are-zero mode flag* turns all numbers which would be
+/// denormalized (exponent bits are all zeros) into zeros.
+///
+/// ## Exception Flags
+///
+/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
+///   Infinity by Infinity).
+///
+/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
+///   number. Mainly this can cause loss of precision.
+///
+/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occured.
+///
+/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occured, i.e., a
+///   result was too large to be represented (e.g., an `f32` with absolute value
+///   greater than `2^128`).
+///
+/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occured, i.e., a
+///   result was too small to be represented in a normalized way (e.g., an `f32`
+///   with absulte value smaller than `2^-126`.)
+///
+/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occured (a.k.a.
+///   precision exception). This means some precision was lost due to rounding.
+///   For example, the fraction `1/3` cannot be represented accurately in a
+///   32 or 64 bit float and computing it would cause this exception to be
+///   raised. Precision exceptions are very common, so they are usually masked.
+///
+/// Exception flags can be read and set using the convenience functions
+/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
+/// check if an operation caused some overflow:
+///
+/// ```rust,ignore
+/// _MM_SET_EXCEPTION_STATE(0);  // clear all exception flags
+/// // perform calculations
+/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
+///     // handle overflow
+/// }
+/// ```
+///
+/// ## Masking Flags
+///
+/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
+/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
+/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
+///
+/// A single masking bit can be set via
+///
+/// ```rust,ignore
+/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
+/// ```
+///
+/// However, since mask bits are by default all set to 1, it is more common to
+/// want to *disable* certain bits. For example, to unmask the underflow
+/// exception, use:
+///
+/// ```rust,ignore
+/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW);  // unmask underflow exception
+/// ```
+///
+/// Warning: an unmasked exception will cause an exception handler to be called.
+/// The standard handler will simply terminate the process. So, in this case
+/// any underflow exception would terminate the current process with something
+/// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
+///
+/// ## Rounding Mode
+///
+/// The rounding mode is describe using two bits. It can be read and set using
+/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
+/// `_MM_SET_ROUNDING_MODE(mode)`.
+///
+/// The rounding modes are:
+///
+/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
+///   value. If two values are equally close, round to even (i.e., least
+///   significant bit will be zero).
+///
+/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
+///
+/// * `_MM_ROUND_UP`: Round toward positive Infinity.
+///
+/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
+///
+/// Example:
+///
+/// ```rust,ignore
+/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
+/// ```
+///
+/// ## Denormals-are-zero/Flush-to-zero Mode
+///
+/// If this bit is set, values that would be denormalized will be set to zero
+/// instead. This is turned off by default.
+///
+/// You can read and enable/disable this mode via the helper functions
+/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
+///
+/// ```rust,ignore
+/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);  // turn off (default)
+/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);  // turn on
+/// ```
+///
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(ldmxcsr))]
+pub unsafe fn _mm_setcsr(val: u32) {
+    ldmxcsr(&val as *const _ as *const i8);
+}
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_EXCEPT_INVALID: u32    = 0x0001;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_EXCEPT_DENORM: u32     = 0x0002;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_EXCEPT_DIV_ZERO: u32   = 0x0004;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_EXCEPT_OVERFLOW: u32   = 0x0008;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_EXCEPT_UNDERFLOW: u32  = 0x0010;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_EXCEPT_INEXACT: u32    = 0x0020;
+pub const _MM_EXCEPT_MASK: u32       = 0x003f;
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_MASK_INVALID: u32      = 0x0080;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_MASK_DENORM: u32       = 0x0100;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_MASK_DIV_ZERO: u32     = 0x0200;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_MASK_OVERFLOW: u32     = 0x0400;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_MASK_UNDERFLOW: u32    = 0x0800;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_MASK_INEXACT: u32      = 0x1000;
+pub const _MM_MASK_MASK: u32         = 0x1f80;
+
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_ROUND_NEAREST: u32     = 0x0000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_ROUND_DOWN: u32        = 0x2000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_ROUND_UP: u32          = 0x4000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;
+pub const _MM_ROUND_MASK: u32        = 0x6000;
+
+pub const _MM_FLUSH_ZERO_MASK: u32   = 0x8000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_FLUSH_ZERO_ON: u32     = 0x8000;
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
+pub const _MM_FLUSH_ZERO_OFF: u32    = 0x0000;
+
+#[inline(always)]
+#[allow(non_snake_case)]
+#[target_feature = "+sse"]
+pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
+    _mm_getcsr() & _MM_MASK_MASK
+}
+
+#[inline(always)]
+#[allow(non_snake_case)]
+#[target_feature = "+sse"]
+pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
+    _mm_getcsr() & _MM_EXCEPT_MASK
+}
+
+#[inline(always)]
+#[allow(non_snake_case)]
+#[target_feature = "+sse"]
+pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
+    _mm_getcsr() & _MM_FLUSH_ZERO_MASK
+}
+
+#[inline(always)]
+#[allow(non_snake_case)]
+#[target_feature = "+sse"]
+pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
+    _mm_getcsr() & _MM_ROUND_MASK
+}
+
+#[inline(always)]
+#[allow(non_snake_case)]
+#[target_feature = "+sse"]
+pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
+    _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
+}
+
+#[inline(always)]
+#[allow(non_snake_case)]
+#[target_feature = "+sse"]
+pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
+    _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
+}
+
+#[inline(always)]
+#[allow(non_snake_case)]
+#[target_feature = "+sse"]
+pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
+    let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
+    //println!("setting csr={:x}", val);
+    _mm_setcsr(val)
+}
+
+#[inline(always)]
+#[allow(non_snake_case)]
+#[target_feature = "+sse"]
+pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
+    _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
+}

 /// See [`_mm_prefetch`](fn._mm_prefetch.html).
 pub const _MM_HINT_T0: i8 = 3;
@ -374,6 +627,12 @@ extern {
    fn maxps(a: f32x4, b: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.movmsk.ps"]
    fn movmskps(a: f32x4) -> i32;
+    #[link_name = "llvm.x86.sse.sfence"]
+    fn sfence();
+    #[link_name = "llvm.x86.sse.stmxcsr"]
+    fn stmxcsr(p: *mut i8);
+    #[link_name = "llvm.x86.sse.ldmxcsr"]
+    fn ldmxcsr(p: *const i8);
    #[link_name = "llvm.prefetch"]
    fn prefetch(p: *const c_void, rw: i32, loc: i32, ty: i32);
 }
@ -383,6 +642,7 @@ mod tests {
    use v128::*;
    use x86::sse;
    use stdsimd_test::simd_test;
+    use test::black_box;  // Used to inhibit constant-folding.

    #[simd_test = "sse"]
    unsafe fn _mm_add_ps() {
@ -577,4 +837,62 @@ mod tests {
        let r = sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0));
        assert_eq!(r, 0b0111);
    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_sfence() {
+        sse::_mm_sfence();
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_getcsr_setcsr_1() {
+        let saved_csr = sse::_mm_getcsr();
+
+        let a = f32x4::new(1.1e-36, 0.0, 0.0, 1.0);
+        let b = f32x4::new(0.001, 0.0, 0.0, 1.0);
+
+        sse::_MM_SET_FLUSH_ZERO_MODE(sse::_MM_FLUSH_ZERO_ON);
+        let r = sse::_mm_mul_ps(black_box(a), black_box(b));
+
+        sse::_mm_setcsr(saved_csr);
+
+        let exp = f32x4::new(0.0, 0.0, 0.0, 1.0);
+        assert_eq!(r, exp);  // first component is a denormalized f32
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_getcsr_setcsr_2() {
+        // Same as _mm_setcsr_1 test, but with opposite flag value.
+
+        let saved_csr = sse::_mm_getcsr();
+
+        let a = f32x4::new(1.1e-36, 0.0, 0.0, 1.0);
+        let b = f32x4::new(0.001, 0.0, 0.0, 1.0);
+
+        sse::_MM_SET_FLUSH_ZERO_MODE(sse::_MM_FLUSH_ZERO_OFF);
+        let r = sse::_mm_mul_ps(black_box(a), black_box(b));
+
+        sse::_mm_setcsr(saved_csr);
+
+        let exp = f32x4::new(1.1e-39, 0.0, 0.0, 1.0);
+        assert_eq!(r, exp);  // first component is a denormalized f32
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_getcsr_setcsr_underflow() {
+        sse::_MM_SET_EXCEPTION_STATE(0);
+
+        let a = f32x4::new(1.1e-36, 0.0, 0.0, 1.0);
+        let b = f32x4::new(1e-5, 0.0, 0.0, 1.0);
+
+        assert_eq!(sse::_MM_GET_EXCEPTION_STATE(), 0);  // just to be sure
+
+        let r = sse::_mm_mul_ps(black_box(a), black_box(b));
+
+        let exp = f32x4::new(1.1e-41, 0.0, 0.0, 1.0);
+        assert_eq!(r, exp);
+
+        let underflow =
+            sse::_MM_GET_EXCEPTION_STATE() & sse::_MM_EXCEPT_UNDERFLOW != 0;
+        assert_eq!(underflow, true);
+    }
 }