add neon instruction vfma_n_* (#1122)

This commit is contained in:
surechen 2021-04-18 00:45:54 +08:00 committed by GitHub
parent 43126c3f65
commit d46e0086e4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 138 additions and 2 deletions

View file

@ -2810,6 +2810,24 @@ pub unsafe fn vfmaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float
vfmaq_f64_(a, b, c)
}
/// Floating-point fused multiply-add to accumulator (vector), scalar-multiplier form.
///
/// Wraps the scalar `c` in a one-lane vector and forwards to `vfma_f64` with the
/// accumulator `a` as the last argument. The generated test for this intrinsic
/// expects `a + b * c` (2.0 + 6.0 * 8.0 == 50.0).
///
/// # Safety
///
/// Declared with `#[target_feature(enable = "neon")]`, so it must only be called
/// when that target feature is available.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmadd))]
pub unsafe fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t {
    let d: float64x1_t = transmute(f64x1::new(c));
    // `d` is already a `float64x1_t`; no further conversion is needed.
    vfma_f64(b, d, a)
}
/// Floating-point fused multiply-add to accumulator (vector), scalar-multiplier form.
///
/// The scalar `c` is replicated into both lanes and the work is delegated to
/// `vfmaq_f64`, which receives the accumulator `a` as its final argument. The
/// accompanying generated test expects lane `i` of the result to be
/// `a[i] + b[i] * c`.
///
/// # Safety
///
/// Declared with `#[target_feature(enable = "neon")]`, so it must only be called
/// when that target feature is available.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmla))]
pub unsafe fn vfmaq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t {
    // Build the two-lane splat of `c`, then reinterpret it as the NEON vector type.
    let lanes = f64x2::new(c, c);
    let broadcast: float64x2_t = transmute(lanes);
    vfmaq_f64(b, broadcast, a)
}
/// Divide
#[inline]
#[target_feature(enable = "neon")]
@ -8232,6 +8250,26 @@ mod test {
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vfma_n_f64() {
    // Expected value is acc + mul * scalar: 2.0 + 6.0 * 8.0 == 50.0.
    let acc: f64 = 2.0;
    let mul: f64 = 6.0;
    let scalar: f64 = 8.0;
    let expected: f64 = 50.0;
    // Inputs are reinterpreted into the intrinsic's parameter types, and the
    // one-lane result is reinterpreted back to a plain `f64` for comparison.
    let out = vfma_n_f64(transmute(acc), transmute(mul), transmute(scalar));
    let actual: f64 = transmute(out);
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vfmaq_n_f64() {
    // Per lane: acc + mul * scalar, i.e. 2.0 + 6.0 * 8.0 == 50.0 and 3.0 + 4.0 * 8.0 == 35.0.
    let acc: f64x2 = f64x2::new(2.0, 3.0);
    let mul: f64x2 = f64x2::new(6.0, 4.0);
    let scalar: f64 = 8.0;
    let expected: f64x2 = f64x2::new(50.0, 35.0);
    let out = vfmaq_n_f64(transmute(acc), transmute(mul), transmute(scalar));
    let actual: f64x2 = transmute(out);
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vdiv_f32() {
let a: f32x2 = f32x2::new(2.0, 6.0);

View file

@ -4738,6 +4738,28 @@ pub unsafe fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float
vfmaq_f32_(a, b, c)
}
/// Floating-point fused multiply-add to accumulator (vector), scalar-multiplier form.
///
/// Replicates the scalar `c` into both `f32` lanes and delegates to `vfma_f32`,
/// passing the accumulator `a` as the last argument. The generated test for this
/// intrinsic expects lane `i` of the result to be `a[i] + b[i] * c`.
///
/// # Safety
///
/// Declared with `#[target_feature(enable = "neon")]` (plus `fp-armv8,v8` when
/// `target_arch = "arm"`), so it must only be called when those features are
/// available.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
    // Build the two-lane splat of `c`, then reinterpret it as the NEON vector type.
    let lanes = f32x2::new(c, c);
    let broadcast: float32x2_t = transmute(lanes);
    vfma_f32(b, broadcast, a)
}
/// Floating-point fused multiply-add to accumulator (vector), scalar-multiplier form.
///
/// Replicates the scalar `c` into all four `f32` lanes and delegates to
/// `vfmaq_f32`, passing the accumulator `a` as the last argument. The generated
/// test for this intrinsic expects lane `i` of the result to be
/// `a[i] + b[i] * c`.
///
/// # Safety
///
/// Declared with `#[target_feature(enable = "neon")]` (plus `fp-armv8,v8` when
/// `target_arch = "arm"`), so it must only be called when those features are
/// available.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
pub unsafe fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
    // Build the four-lane splat of `c`, then reinterpret it as the NEON vector type.
    let lanes = f32x4::new(c, c, c, c);
    let broadcast: float32x4_t = transmute(lanes);
    vfmaq_f32(b, broadcast, a)
}
/// Subtract
#[inline]
#[target_feature(enable = "neon")]
@ -13974,6 +13996,26 @@ mod test {
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vfma_n_f32() {
    // Per lane: acc + mul * scalar, i.e. 2.0 + 6.0 * 8.0 == 50.0 and 3.0 + 4.0 * 8.0 == 35.0.
    let acc: f32x2 = f32x2::new(2.0, 3.0);
    let mul: f32x2 = f32x2::new(6.0, 4.0);
    let scalar: f32 = 8.0;
    let expected: f32x2 = f32x2::new(50.0, 35.0);
    let out = vfma_n_f32(transmute(acc), transmute(mul), transmute(scalar));
    let actual: f32x2 = transmute(out);
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vfmaq_n_f32() {
    // Per lane: acc + mul * scalar with scalar == 8.0:
    //   2 + 6*8 = 50, 3 + 4*8 = 35, 4 + 7*8 = 60, 5 + 8*8 = 69.
    let acc: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
    let mul: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
    let scalar: f32 = 8.0;
    let expected: f32x4 = f32x4::new(50.0, 35.0, 60.0, 69.0);
    let out = vfmaq_n_f32(transmute(acc), transmute(mul), transmute(scalar));
    let actual: f32x4 = transmute(out);
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsub_s8() {
let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);

View file

@ -1566,6 +1566,62 @@ link-arm = llvm.fma._EXT_
link-aarch64 = llvm.fma._EXT_
generate float*_t
/// Floating-point fused Multiply-Add to accumulator (vector)
// Scalar-multiplier (`_n`) form for float64x1_t. The scalar `c` is wrapped in a
// one-lane vector `d`, then vfma is called with the accumulator `a` last.
// `d` is declared as `in_t`, so it is passed as-is, matching the other `_n`
// entries for vfma.
// a/b/validate list four values; the generated test for this one-lane type uses
// only the first of each (2.0 + 6.0 * 8.0 = 50.0).
name = vfma
n-suffix
multi_fn = transmute, d:in_t, {f64x1::new, c}
multi_fn = vfma-self-noext, b, d, a
a = 2.0, 3.0, 4.0, 5.0
b = 6.0, 4.0, 7.0, 8.0
c = 8.0
validate 50.0, 35.0, 60.0, 69.0
aarch64 = fmadd
generate float64x1_t:float64x1_t:f64:float64x1_t
/// Floating-point fused Multiply-Add to accumulator (vector)
// Scalar-multiplier (`_n`) form for float64x2_t. The scalar `c` is replicated
// into both lanes of `d`, then vfma is called with the accumulator `a` last.
// a/b/validate list four values; the generated test for this two-lane type uses
// only the first two of each (expected: a + b * c per lane).
// `aarch64 = fmla` is the instruction named in the generated `assert_instr`.
name = vfma
n-suffix
multi_fn = transmute, d:in_t, {f64x2::new, c, c}
multi_fn = vfma-self-noext, b, d, a
a = 2.0, 3.0, 4.0, 5.0
b = 6.0, 4.0, 7.0, 8.0
c = 8.0
validate 50.0, 35.0, 60.0, 69.0
aarch64 = fmla
generate float64x2_t:float64x2_t:f64:float64x2_t
/// Floating-point fused Multiply-Add to accumulator (vector)
// Scalar-multiplier (`_n`) form for float32x2_t. The scalar `c` is replicated
// into both lanes of `d`, then vfma is called with the accumulator `a` last.
// a/b/validate list four values; the generated test for this two-lane type uses
// only the first two of each (expected: a + b * c per lane).
// `target = fp-armv8` shows up in the generated code as an extra
// `target_feature(enable = "fp-armv8,v8")` applied when target_arch = "arm".
// `arm` / `aarch64` name the instruction checked by `assert_instr` on each arch.
name = vfma
n-suffix
multi_fn = transmute, d:in_t, {f32x2::new, c, c}
multi_fn = vfma-self-noext, b, d, a
a = 2.0, 3.0, 4.0, 5.0
b = 6.0, 4.0, 7.0, 8.0
c = 8.0
validate 50.0, 35.0, 60.0, 69.0
target = fp-armv8
arm = vfma
aarch64 = fmla
generate float32x2_t:float32x2_t:f32:float32x2_t
/// Floating-point fused Multiply-Add to accumulator (vector)
// Scalar-multiplier (`_n`) form for float32x4_t. The scalar `c` is replicated
// into all four lanes of `d`, then vfma is called with the accumulator `a` last.
// All four a/b/validate values are used by the generated test
// (expected: a + b * c per lane, e.g. 5.0 + 8.0 * 8.0 = 69.0).
// `target = fp-armv8` shows up in the generated code as an extra
// `target_feature(enable = "fp-armv8,v8")` applied when target_arch = "arm".
// `arm` / `aarch64` name the instruction checked by `assert_instr` on each arch.
name = vfma
n-suffix
multi_fn = transmute, d:in_t, {f32x4::new, c, c, c, c}
multi_fn = vfma-self-noext, b, d, a
a = 2.0, 3.0, 4.0, 5.0
b = 6.0, 4.0, 7.0, 8.0
c = 8.0
validate 50.0, 35.0, 60.0, 69.0
target = fp-armv8
arm = vfma
aarch64 = fmla
generate float32x4_t:float32x4_t:f32:float32x4_t
/// Divide
name = vdiv
fn = simd_div

View file

@ -120,9 +120,9 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
// Intrinsics using `cvtpi2ps` are typically "composites" and
// in some cases exceed the limit.
"cvtpi2ps" => 25,
// core_arch/src/arm_shared/simd32
"usad8" => 27,
// vfmaq_n_f32_vfma : #instructions = 26 >= 22 (limit)
"usad8" | "vfma" => 27,
"qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29,
// Original limit was 20 instructions, but ARM DSP Intrinsics