Add sqrt and more rsqrte neon instructions (#1078)

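This adds NEON intrinsics for square root (`vsqrt*`) and some of the missing reciprocal square-root estimate intrinsics (`vrsqrte*`). It also fixes the duplicated `a` parameter name in the generated extern declarations and lets the generator emit one- or two-parameter link declarations.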
2021-03-13 15:27:44 +01:00 · 2021-03-13 15:27:44 +01:00 · 282cfa4db7
commit 282cfa4db7
parent 677644afb9
5 changed files with 203 additions and 34 deletions
--- a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
@ -17,7 +17,7 @@ pub unsafe fn vabd_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fabd.v1f64")]
-        fn vabd_f64_(a: float64x1_t, a: float64x1_t) -> float64x1_t;
+        fn vabd_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
    }
    vabd_f64_(a, b)
 }
@ -30,7 +30,7 @@ pub unsafe fn vabdq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fabd.v2f64")]
-        fn vabdq_f64_(a: float64x2_t, a: float64x2_t) -> float64x2_t;
+        fn vabdq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
    }
    vabdq_f64_(a, b)
 }
@ -1087,7 +1087,7 @@ pub unsafe fn vmax_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v1f64")]
-        fn vmax_f64_(a: float64x1_t, a: float64x1_t) -> float64x1_t;
+        fn vmax_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
    }
    vmax_f64_(a, b)
 }
@ -1100,7 +1100,7 @@ pub unsafe fn vmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v2f64")]
-        fn vmaxq_f64_(a: float64x2_t, a: float64x2_t) -> float64x2_t;
+        fn vmaxq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
    }
    vmaxq_f64_(a, b)
 }
@ -1113,7 +1113,7 @@ pub unsafe fn vmin_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v1f64")]
-        fn vmin_f64_(a: float64x1_t, a: float64x1_t) -> float64x1_t;
+        fn vmin_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
    }
    vmin_f64_(a, b)
 }
@ -1126,11 +1126,69 @@ pub unsafe fn vminq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v2f64")]
-        fn vminq_f64_(a: float64x2_t, a: float64x2_t) -> float64x2_t;
+        fn vminq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
    }
    vminq_f64_(a, b)
 }

+/// Calculates the square root of each lane.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fsqrt))]
+pub unsafe fn vsqrt_f32(a: float32x2_t) -> float32x2_t {
+    simd_fsqrt(a)
+}
+
+/// Calculates the square root of each lane.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fsqrt))]
+pub unsafe fn vsqrtq_f32(a: float32x4_t) -> float32x4_t {
+    simd_fsqrt(a)
+}
+
+/// Calculates the square root of each lane.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fsqrt))]
+pub unsafe fn vsqrt_f64(a: float64x1_t) -> float64x1_t {
+    simd_fsqrt(a)
+}
+
+/// Calculates the square root of each lane.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fsqrt))]
+pub unsafe fn vsqrtq_f64(a: float64x2_t) -> float64x2_t {
+    simd_fsqrt(a)
+}
+
+/// Reciprocal square-root estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frsqrte))]
+pub unsafe fn vrsqrte_f64(a: float64x1_t) -> float64x1_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v1f64")]
+        fn vrsqrte_f64_(a: float64x1_t) -> float64x1_t;
+    }
+    vrsqrte_f64_(a)
+}
+
+/// Reciprocal square-root estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frsqrte))]
+pub unsafe fn vrsqrteq_f64(a: float64x2_t) -> float64x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f64")]
+        fn vrsqrteq_f64_(a: float64x2_t) -> float64x2_t;
+    }
+    vrsqrteq_f64_(a)
+}
+
 #[cfg(test)]
 mod test {
    use super::*;
@ -2233,4 +2291,52 @@ mod test {
        let r: f64x2 = transmute(vminq_f64(transmute(a), transmute(b)));
        assert_eq!(r, e);
    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vsqrt_f32() {
+        let a: f32x2 = f32x2::new(4.0, 9.0);
+        let e: f32x2 = f32x2::new(2.0, 3.0);
+        let r: f32x2 = transmute(vsqrt_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vsqrtq_f32() {
+        let a: f32x4 = f32x4::new(4.0, 9.0, 16.0, 25.0);
+        let e: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
+        let r: f32x4 = transmute(vsqrtq_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vsqrt_f64() {
+        let a: f64 = 4.0;
+        let e: f64 = 2.0;
+        let r: f64 = transmute(vsqrt_f64(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vsqrtq_f64() {
+        let a: f64x2 = f64x2::new(4.0, 9.0);
+        let e: f64x2 = f64x2::new(2.0, 3.0);
+        let r: f64x2 = transmute(vsqrtq_f64(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vrsqrte_f64() {
+        let a: f64 = 1.0;
+        let e: f64 = 0.998046875;
+        let r: f64 = transmute(vrsqrte_f64(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vrsqrteq_f64() {
+        let a: f64x2 = f64x2::new(1.0, 2.0);
+        let e: f64x2 = f64x2::new(0.998046875, 0.705078125);
+        let r: f64x2 = transmute(vrsqrteq_f64(transmute(a)));
+        assert_eq!(r, e);
+    }
 }
--- a/library/stdarch/crates/core_arch/src/arm/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/arm/neon/generated.rs
@ -3349,6 +3349,38 @@ pub unsafe fn vminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
 vminq_f32_(a, b)
 }

+/// Reciprocal square-root estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))]
+pub unsafe fn vrsqrte_f32(a: float32x2_t) -> float32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32")]
+        fn vrsqrte_f32_(a: float32x2_t) -> float32x2_t;
+    }
+vrsqrte_f32_(a)
+}
+
+/// Reciprocal square-root estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))]
+pub unsafe fn vrsqrteq_f32(a: float32x4_t) -> float32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v4f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v4f32")]
+        fn vrsqrteq_f32_(a: float32x4_t) -> float32x4_t;
+    }
+vrsqrteq_f32_(a)
+}
+
 #[cfg(test)]
 #[allow(overflowing_literals)]
 mod test {
@ -5964,4 +5996,20 @@ mod test {
        let r: f32x4 = transmute(vminq_f32(transmute(a), transmute(b)));
        assert_eq!(r, e);
    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vrsqrte_f32() {
+        let a: f32x2 = f32x2::new(1.0, 2.0);
+        let e: f32x2 = f32x2::new(0.998046875, 0.705078125);
+        let r: f32x2 = transmute(vrsqrte_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vrsqrteq_f32() {
+        let a: f32x4 = f32x4::new(1.0, 2.0, 3.0, 4.0);
+        let e: f32x4 = f32x4::new(0.998046875, 0.705078125, 0.576171875, 0.4990234375);
+        let r: f32x4 = transmute(vrsqrteq_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
 }
--- a/library/stdarch/crates/core_arch/src/arm/neon/mod.rs
+++ b/library/stdarch/crates/core_arch/src/arm/neon/mod.rs
@ -136,10 +136,6 @@ extern "C" {
    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.abs.v4i32")]
    fn vabsq_s32_(a: int32x4_t) -> int32x4_t;

-    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32")]
-    fn frsqrte_v2f32(a: float32x2_t) -> float32x2_t;
-
    //uint32x2_t vqmovn_u64 (uint64x2_t a)
    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnu.v2i32")]
    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqxtn.v2i32")]
@ -2466,15 +2462,6 @@ pub unsafe fn vmovl_u32(a: uint32x2_t) -> uint64x2_t {
    simd_cast(a)
 }

-/// Reciprocal square-root estimate.
-#[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
-pub unsafe fn vrsqrte_f32(a: float32x2_t) -> float32x2_t {
-    frsqrte_v2f32(a)
-}
-
 /// Vector bitwise not.
 #[inline]
 #[target_feature(enable = "neon")]
@ -7906,14 +7893,6 @@ mod tests {
        assert_eq!(r, e);
    }

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vrsqrt_f32() {
-        let a = f32x2::new(1.0, 2.0);
-        let e = f32x2::new(0.9980469, 0.7050781);
-        let r: f32x2 = transmute(vrsqrte_f32(transmute(a)));
-        assert_eq!(r, e);
-    }
-
    #[simd_test(enable = "neon")]
    unsafe fn test_vpmin_s8() {
        let a = i8x8::new(1, -2, 3, -4, 5, 6, 7, 8);
--- a/library/stdarch/crates/stdarch-gen/neon.spec
+++ b/library/stdarch/crates/stdarch-gen/neon.spec
@ -720,3 +720,25 @@ aarch64 = fmin
 link-arm = vmins._EXT_
 link-aarch64 = fmin._EXT_
 generate float*_t
+
+/// Calculates the square root of each lane.
+name = vsqrt
+fn = simd_fsqrt
+a = 4.0, 9.0, 16.0, 25.0
+validate 2.0, 3.0, 4.0, 5.0
+
+aarch64 = fsqrt
+generate float*_t, float64x*_t
+
+/// Reciprocal square-root estimate.
+name = vrsqrte
+a = 1.0, 2.0, 3.0, 4.0
+validate 0.998046875, 0.705078125, 0.576171875, 0.4990234375
+
+aarch64 = frsqrte
+link-aarch64 = frsqrte._EXT_
+generate float64x*_t
+
+arm = vrsqrte
+link-arm = vrsqrte._EXT_
+generate float*_t
--- a/library/stdarch/crates/stdarch-gen/src/main.rs
+++ b/library/stdarch/crates/stdarch-gen/src/main.rs
@ -345,13 +345,20 @@ fn gen_aarch64(
            r#"#[allow(improper_ctypes)]
    extern "C" {{
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.{}")]
-        fn {}(a: {}, a: {}) -> {};
+        fn {}({}) -> {};
    }}
    "#,
            link_aarch64.replace("_EXT_", ext),
            current_fn,
-            in_t,
-            in_t,
+            match para_num {
+                1 => {
+                    format!("a: {}", in_t)
+                }
+                2 => {
+                    format!("a: {}, b: {}", in_t, in_t)
+                }
+                _ => unimplemented!("unknown para_num"),
+            },
            out_t
        )
    } else {
@ -527,7 +534,7 @@ fn gen_arm(
        }
        String::new()
    } else {
-        if link_aarch64.is_none() || link_arm.is_none() {
+        if link_aarch64.is_none() && link_arm.is_none() {
            panic!(
                "[{}] Either fn or link-arm and link-aarch have to be specified.",
                name
@ -544,14 +551,21 @@ fn gen_arm(
    extern "C" {{
        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.{}")]
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.{}")]
-        fn {}(a: {}, b: {}) -> {};
+        fn {}({}) -> {};
    }}
 "#,
                link_arm.replace("_EXT_", ext),
                link_aarch64.replace("_EXT_", ext),
                current_fn,
-                in_t,
-                in_t,
+                match para_num {
+                    1 => {
+                        format!("a: {}", in_t)
+                    }
+                    2 => {
+                        format!("a: {}, b: {}", in_t, in_t)
+                    }
+                    _ => unimplemented!("unknown para_num"),
+                },
                out_t
            )
        } else {