From d46e0086e43fa0551b96cebf4486450548caa1d4 Mon Sep 17 00:00:00 2001 From: surechen Date: Sun, 18 Apr 2021 00:45:54 +0800 Subject: [PATCH] add neon instruction vfma_n_* (#1122) --- .../core_arch/src/aarch64/neon/generated.rs | 38 +++++++++++++ .../src/arm_shared/neon/generated.rs | 42 ++++++++++++++ library/stdarch/crates/stdarch-gen/neon.spec | 56 +++++++++++++++++++ .../stdarch/crates/stdarch-test/src/lib.rs | 4 +- 4 files changed, 138 insertions(+), 2 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs index 78e11e691577..ed5b0b5423b1 100644 --- a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs +++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs @@ -2810,6 +2810,24 @@ pub unsafe fn vfmaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float vfmaq_f64_(a, b, c) } +/// Floating-point fused Multiply-Add to accumulator(vector) +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmadd))] +pub unsafe fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t { + let d: float64x1_t = transmute(f64x1::new(c)); + vfma_f64(b, transmute(d), a) +} + +/// Floating-point fused Multiply-Add to accumulator(vector) +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fmla))] +pub unsafe fn vfmaq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t { + let d: float64x2_t = transmute(f64x2::new(c, c)); + vfmaq_f64(b, d, a) +} + /// Divide #[inline] #[target_feature(enable = "neon")] @@ -8232,6 +8250,26 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vfma_n_f64() { + let a: f64 = 2.0; + let b: f64 = 6.0; + let c: f64 = 8.0; + let e: f64 = 50.0; + let r: f64 = transmute(vfma_n_f64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vfmaq_n_f64() { + let a: f64x2 = f64x2::new(2.0, 3.0); + let b: f64x2 = f64x2::new(6.0, 4.0); + let c: f64 = 8.0; + let e: f64x2 = f64x2::new(50.0, 35.0); + let r: f64x2 = transmute(vfmaq_n_f64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vdiv_f32() { let a: f32x2 = f32x2::new(2.0, 6.0); diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs index bc98607e8b42..3bfd86f03c1a 100644 --- a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs @@ -4738,6 +4738,28 @@ pub unsafe fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float vfmaq_f32_(a, b, c) } +/// Floating-point fused Multiply-Add to accumulator(vector) +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))] +pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t { + let d: float32x2_t = transmute(f32x2::new(c, c)); + vfma_f32(b, d, a) +} + +/// Floating-point fused Multiply-Add to accumulator(vector) +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))] +pub unsafe fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t { + let d: float32x4_t = transmute(f32x4::new(c, c, c, c)); + vfmaq_f32(b, d, a) +} + /// Subtract #[inline] #[target_feature(enable = "neon")] @@ -13974,6 +13996,26 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vfma_n_f32() { + let a: f32x2 = f32x2::new(2.0, 3.0); + let b: f32x2 = f32x2::new(6.0, 4.0); + let c: f32 = 8.0; + let e: f32x2 = f32x2::new(50.0, 35.0); + let r: f32x2 = transmute(vfma_n_f32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vfmaq_n_f32() { + let a: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0); + let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0); + let c: f32 = 8.0; + let e: f32x4 = f32x4::new(50.0, 35.0, 60.0, 69.0); + let r: f32x4 = transmute(vfmaq_n_f32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vsub_s8() { let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); diff --git a/library/stdarch/crates/stdarch-gen/neon.spec b/library/stdarch/crates/stdarch-gen/neon.spec index bc87effb7f7d..74ca739cc4bc 100644 --- a/library/stdarch/crates/stdarch-gen/neon.spec +++ b/library/stdarch/crates/stdarch-gen/neon.spec @@ -1566,6 +1566,62 @@ link-arm = llvm.fma._EXT_ link-aarch64 = llvm.fma._EXT_ generate float*_t +/// Floating-point fused Multiply-Add to accumulator(vector) +name = vfma +n-suffix +multi_fn = transmute, d:in_t, {f64x1::new, c} +multi_fn = vfma-self-noext, b, transmute(d), a +a = 2.0, 3.0, 4.0, 5.0 +b = 6.0, 4.0, 7.0, 8.0 +c = 8.0 +validate 50.0, 35.0, 60.0, 69.0 + +aarch64 = fmadd +generate float64x1_t:float64x1_t:f64:float64x1_t + +/// Floating-point fused Multiply-Add to accumulator(vector) +name = vfma +n-suffix +multi_fn = transmute, d:in_t, {f64x2::new, c, c} +multi_fn = vfma-self-noext, b, d, a +a = 2.0, 3.0, 4.0, 5.0 +b = 6.0, 4.0, 7.0, 8.0 +c = 8.0 +validate 50.0, 35.0, 60.0, 69.0 + +aarch64 = fmla +generate float64x2_t:float64x2_t:f64:float64x2_t + +/// Floating-point fused Multiply-Add to accumulator(vector) +name = vfma +n-suffix +multi_fn = transmute, d:in_t, {f32x2::new, c, c} +multi_fn = vfma-self-noext, b, d, a +a = 2.0, 3.0, 4.0, 5.0 +b = 6.0, 4.0, 7.0, 8.0 +c = 8.0 +validate 50.0, 35.0, 60.0, 69.0 + +target = fp-armv8 +arm = vfma +aarch64 = fmla +generate float32x2_t:float32x2_t:f32:float32x2_t + +/// Floating-point fused Multiply-Add to accumulator(vector) +name = vfma +n-suffix +multi_fn = transmute, d:in_t, {f32x4::new, c, c, c, c} +multi_fn = vfma-self-noext, b, d, a +a = 2.0, 3.0, 4.0, 5.0 +b = 6.0, 4.0, 7.0, 8.0 +c = 8.0 +validate 50.0, 35.0, 60.0, 69.0 + +target = fp-armv8 +arm = vfma +aarch64 = fmla +generate float32x4_t:float32x4_t:f32:float32x4_t + /// Divide name = vdiv fn = simd_div diff --git a/library/stdarch/crates/stdarch-test/src/lib.rs b/library/stdarch/crates/stdarch-test/src/lib.rs index 408d7190e97f..8f6aa4a267c5 100644 --- a/library/stdarch/crates/stdarch-test/src/lib.rs +++ b/library/stdarch/crates/stdarch-test/src/lib.rs @@ -120,9 +120,9 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) { // Intrinsics using `cvtpi2ps` are typically "composites" and // in some cases exceed the limit. "cvtpi2ps" => 25, - // core_arch/src/arm_shared/simd32 - "usad8" => 27, + // vfmaq_n_f32_vfma : #instructions = 26 >= 22 (limit) + "usad8" | "vfma" => 27, "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29, // Original limit was 20 instructions, but ARM DSP Intrinsics