Merge pull request #38 from kocsis1david/master

Some SSE instructions
2017-09-25 14:05:38 -05:00 · 2017-09-25 14:05:38 -05:00 · 4d504a7767
commit 4d504a7767
parent d888d2d350 1d3f72c336
2 changed files with 265 additions and 15 deletions
--- a/library/stdarch/TODO.md
+++ b/library/stdarch/TODO.md
@ -59,23 +59,23 @@ sse
 * [ ] `_m_pmovmskb`
 * [ ] `_mm_shuffle_pi16`
 * [ ] `_m_pshufw`
-* [ ] `_mm_add_ss`
-* [ ] `_mm_add_ps`
-* [ ] `_mm_sub_ss`
-* [ ] `_mm_sub_ps`
-* [ ] `_mm_mul_ss`
-* [ ] `_mm_mul_ps`
-* [ ] `_mm_div_ss`
-* [ ] `_mm_div_ps`
-* [ ] `_mm_sqrt_ss`
+* [x] `_mm_add_ss`
+* [x] `_mm_add_ps`
+* [x] `_mm_sub_ss`
+* [x] `_mm_sub_ps`
+* [x] `_mm_mul_ss`
+* [x] `_mm_mul_ps`
+* [x] `_mm_div_ss`
+* [x] `_mm_div_ps`
+* [x] `_mm_sqrt_ss`
 * [x] `_mm_sqrt_ps`
-* [ ] `_mm_rcp_ss`
+* [x] `_mm_rcp_ss`
 * [x] `_mm_rcp_ps`
-* [ ] `_mm_rsqrt_ss`
+* [x] `_mm_rsqrt_ss`
 * [x] `_mm_rsqrt_ps`
-* [ ] `_mm_min_ss`
+* [x] `_mm_min_ss`
 * [x] `_mm_min_ps`
-* [ ] `_mm_max_ss`
+* [x] `_mm_max_ss`
 * [x] `_mm_max_ps`
 * [ ] `_mm_and_ps`
 * [ ] `_mm_andnot_ps`
@ -458,8 +458,8 @@ sse4.1
 * [ ] `_mm_blendv_ps`
 * [x] `_mm_blendv_epi8`
 * [ ] `_mm_blend_epi16`
-* [ ] `_mm_dp_pd`
-* [ ] `_mm_dp_ps`
+* [x] `_mm_dp_pd`
+* [x] `_mm_dp_ps`
 * [ ] `_mm_extract_ps`
 * [ ] `_mm_extract_epi8`
 * [ ] `_mm_extract_epi32`
--- a/library/stdarch/src/x86/sse.rs
+++ b/library/stdarch/src/x86/sse.rs
@ -4,6 +4,83 @@ use v128::*;
 #[cfg(test)]
 use assert_instr::assert_instr;

+/// Adds the first component of `a` and `b`, the other components are copied
+/// from `a`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(addss))]
+pub fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 {
+    unsafe { addss(a, b) }
+}
+
+/// Adds f32x4 vectors.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(addps))]
+pub fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
+    a + b
+}
+
+/// Subtracts the first component of `b` from the first component of `a`, the
+/// other components are copied from `a`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(subss))]
+pub fn _mm_sub_ss(a: f32x4, b: f32x4) -> f32x4 {
+    unsafe { subss(a, b) }
+}
+
+/// Subtracts f32x4 vectors.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(subps))]
+pub fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
+    a - b
+}
+
+/// Multiplies the first component of `a` and `b`, the other components are
+/// copied from `a`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(mulss))]
+pub fn _mm_mul_ss(a: f32x4, b: f32x4) -> f32x4 {
+    unsafe { mulss(a, b) }
+}
+
+/// Multiplies f32x4 vectors.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(mulps))]
+pub fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
+    a * b
+}
+
+/// Divides the first component of `a` by the first component of `b`, the
+/// other components are copied from `a`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(divss))]
+pub fn _mm_div_ss(a: f32x4, b: f32x4) -> f32x4 {
+    unsafe { divss(a, b) }
+}
+
+/// Divides f32x4 vectors.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(divps))]
+pub fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
+    a / b
+}
+
+/// Return the square root of the first single-precision (32-bit)
+/// floating-point element in `a`, the other elements are unchanged.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(sqrtss))]
+pub fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
+    unsafe { sqrtss(a) }
+}
+
 /// Return the square root of packed single-precision (32-bit) floating-point
 /// elements in `a`.
 #[inline(always)]
@ -13,6 +90,15 @@ pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
    unsafe { sqrtps(a) }
 }

+/// Return the approximate reciprocal of the first single-precision
+/// (32-bit) floating-point element in `a`, the other elements are unchanged.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(rcpss))]
+pub fn _mm_rcp_ss(a: f32x4) -> f32x4 {
+    unsafe { rcpss(a) }
+}
+
 /// Return the approximate reciprocal of packed single-precision (32-bit)
 /// floating-point elements in `a`.
 #[inline(always)]
@ -22,6 +108,15 @@ pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
    unsafe { rcpps(a) }
 }

+/// Return the approximate reciprocal square root of the first single-precision
+/// (32-bit) floating-point element in `a`, the other elements are unchanged.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(rsqrtss))]
+pub fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
+    unsafe { rsqrtss(a) }
+}
+
 /// Return the approximate reciprocal square root of packed single-precision
 /// (32-bit) floating-point elements in `a`.
 #[inline(always)]
@ -31,6 +126,16 @@ pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
    unsafe { rsqrtps(a) }
 }

+/// Compare the first single-precision (32-bit) floating-point element of `a`
+/// and `b`, and return the minimum value in the first element of the return
+/// value, the other elements are copied from `a`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(minss))]
+pub fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
+    unsafe { minss(a, b) }
+}
+
 /// Compare packed single-precision (32-bit) floating-point elements in `a` and
 /// `b`, and return the corresponding minimum values.
 #[inline(always)]
@ -40,6 +145,16 @@ pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
    unsafe { minps(a, b) }
 }

+/// Compare the first single-precision (32-bit) floating-point element of `a`
+/// and `b`, and return the maximum value in the first element of the return
+/// value, the other elements are copied from `a`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(maxss))]
+pub fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
+    unsafe { maxss(a, b) }
+}
+
 /// Compare packed single-precision (32-bit) floating-point elements in `a` and
 /// `b`, and return the corresponding maximum values.
 #[inline(always)]
@ -70,14 +185,32 @@ pub fn _mm_movemask_ps(a: f32x4) -> i32 {

 #[allow(improper_ctypes)]
 extern {
+    #[link_name = "llvm.x86.sse.add.ss"]
+    fn addss(a: f32x4, b: f32x4) -> f32x4;
+    #[link_name = "llvm.x86.sse.sub.ss"]
+    fn subss(a: f32x4, b: f32x4) -> f32x4;
+    #[link_name = "llvm.x86.sse.mul.ss"]
+    fn mulss(a: f32x4, b: f32x4) -> f32x4;
+    #[link_name = "llvm.x86.sse.div.ss"]
+    fn divss(a: f32x4, b: f32x4) -> f32x4;
+    #[link_name = "llvm.x86.sse.sqrt.ss"]
+    fn sqrtss(a: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.sqrt.ps"]
    fn sqrtps(a: f32x4) -> f32x4;
+    #[link_name = "llvm.x86.sse.rcp.ss"]
+    fn rcpss(a: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.rcp.ps"]
    fn rcpps(a: f32x4) -> f32x4;
+    #[link_name = "llvm.x86.sse.rsqrt.ss"]
+    fn rsqrtss(a: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.rsqrt.ps"]
    fn rsqrtps(a: f32x4) -> f32x4;
+    #[link_name = "llvm.x86.sse.min.ss"]
+    fn minss(a: f32x4, b: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.min.ps"]
    fn minps(a: f32x4, b: f32x4) -> f32x4;
+    #[link_name = "llvm.x86.sse.max.ss"]
+    fn maxss(a: f32x4, b: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.max.ps"]
    fn maxps(a: f32x4, b: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.movmsk.ps"]
@ -89,6 +222,87 @@ mod tests {
    use v128::*;
    use x86::sse;

+    #[test]
+    #[target_feature = "+sse"]
+    fn _mm_add_ps() {
+        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
+        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
+        let r = sse::_mm_add_ps(a, b);
+        assert_eq!(r, f32x4::new(-101.0, 25.0, 0.0, -15.0));
+    }
+
+    #[test]
+    #[target_feature = "+sse"]
+    fn _mm_add_ss() {
+        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
+        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
+        let r = sse::_mm_add_ss(a, b);
+        assert_eq!(r, f32x4::new(-101.0, 5.0, 0.0, -10.0));
+    }
+
+    #[test]
+    #[target_feature = "+sse"]
+    fn _mm_sub_ps() {
+        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
+        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
+        let r = sse::_mm_sub_ps(a, b);
+        assert_eq!(r, f32x4::new(99.0, -15.0, 0.0, -5.0));
+    }
+
+    #[test]
+    #[target_feature = "+sse"]
+    fn _mm_sub_ss() {
+        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
+        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
+        let r = sse::_mm_sub_ss(a, b);
+        assert_eq!(r, f32x4::new(99.0, 5.0, 0.0, -10.0));
+    }
+
+    #[test]
+    #[target_feature = "+sse"]
+    fn _mm_mul_ps() {
+        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
+        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
+        let r = sse::_mm_mul_ps(a, b);
+        assert_eq!(r, f32x4::new(100.0, 100.0, 0.0, 50.0));
+    }
+
+    #[test]
+    #[target_feature = "+sse"]
+    fn _mm_mul_ss() {
+        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
+        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
+        let r = sse::_mm_mul_ss(a, b);
+        assert_eq!(r, f32x4::new(100.0, 5.0, 0.0, -10.0));
+    }
+
+    #[test]
+    #[target_feature = "+sse"]
+    fn _mm_div_ps() {
+        let a = f32x4::new(-1.0, 5.0, 2.0, -10.0);
+        let b = f32x4::new(-100.0, 20.0, 0.2, -5.0);
+        let r = sse::_mm_div_ps(a, b);
+        assert_eq!(r, f32x4::new(0.01, 0.25, 10.0, 2.0));
+    }
+
+    #[test]
+    #[target_feature = "+sse"]
+    fn _mm_div_ss() {
+        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
+        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
+        let r = sse::_mm_div_ss(a, b);
+        assert_eq!(r, f32x4::new(0.01, 5.0, 0.0, -10.0));
+    }
+
+    #[test]
+    #[target_feature = "+sse"]
+    fn _mm_sqrt_ss() {
+        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
+        let r = sse::_mm_sqrt_ss(a);
+        let e = f32x4::new(2.0, 13.0, 16.0, 100.0);
+        assert_eq!(r, e);
+    }
+
    #[test]
    #[target_feature = "+sse"]
    fn _mm_sqrt_ps() {
@ -98,6 +312,15 @@ mod tests {
        assert_eq!(r, e);
    }

+    #[test]
+    #[target_feature = "+sse"]
+    fn _mm_rcp_ss() {
+        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
+        let r = sse::_mm_rcp_ss(a);
+        let e = f32x4::new(0.24993896, 13.0, 16.0, 100.0);
+        assert_eq!(r, e);
+    }
+
    #[test]
    #[target_feature = "+sse"]
    fn _mm_rcp_ps() {
@ -107,6 +330,15 @@ mod tests {
        assert_eq!(r, e);
    }

+    #[test]
+    #[target_feature = "+sse"]
+    fn _mm_rsqrt_ss() {
+        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
+        let r = sse::_mm_rsqrt_ss(a);
+        let e = f32x4::new(0.49987793, 13.0, 16.0, 100.0);
+        assert_eq!(r, e);
+    }
+
    #[test]
    #[target_feature = "+sse"]
    fn _mm_rsqrt_ps() {
@ -116,6 +348,15 @@ mod tests {
        assert_eq!(r, e);
    }

+    #[test]
+    #[target_feature = "+sse"]
+    fn _mm_min_ss() {
+        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
+        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
+        let r = sse::_mm_min_ss(a, b);
+        assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
+    }
+
    #[test]
    #[target_feature = "+sse"]
    fn _mm_min_ps() {
@ -125,6 +366,15 @@ mod tests {
        assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
    }

+    #[test]
+    #[target_feature = "+sse"]
+    fn _mm_max_ss() {
+        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
+        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
+        let r = sse::_mm_max_ss(a, b);
+        assert_eq!(r, f32x4::new(-1.0, 5.0, 0.0, -10.0));
+    }
+
    #[test]
    #[target_feature = "+sse"]
    fn _mm_max_ps() {