add neon instruction vfma_n_* (#1122)
This commit is contained in:
parent
43126c3f65
commit
d46e0086e4
4 changed files with 138 additions and 2 deletions
|
|
@ -2810,6 +2810,24 @@ pub unsafe fn vfmaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float
|
|||
vfmaq_f64_(a, b, c)
|
||||
}
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
///
/// Scalar-operand (`_n`) form: the scalar `c` is placed into a one-lane
/// vector `d` and the call is forwarded as `vfma_f64(b, d, a)`. The
/// accompanying `test_vfma_n_f64` expects `a = 2.0, b = 6.0, c = 8.0` to
/// yield `50.0`, i.e. `a + b * c`.
///
/// Under `cfg(test)`, `assert_instr` names `fmadd` as the instruction for
/// this intrinsic.
///
/// # Safety
///
/// NOTE(review): declared `unsafe` with `#[target_feature(enable = "neon")]`;
/// presumably the caller must ensure NEON is available - confirm against the
/// crate's safety conventions.
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmadd))]
|
||||
pub unsafe fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t {
|
||||
// Wrap the scalar multiplier `c` in a one-lane vector.
let d: float64x1_t = transmute(f64x1::new(c));
|
||||
// `d` is already annotated as `float64x1_t`, so this `transmute` looks like
// an identity conversion; presumably inherited from the generator spec,
// which passes `transmute(d)` for this variant - TODO confirm.
// Argument order is (b, d, a): `a` is passed last.
vfma_f64(b, transmute(d), a)
|
||||
}
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
///
/// Scalar-operand (`_n`) form: the scalar `c` is copied into both lanes of a
/// two-lane vector `d` and the call is forwarded as `vfmaq_f64(b, d, a)`.
/// The accompanying `test_vfmaq_n_f64` expects `a = (2.0, 3.0)`,
/// `b = (6.0, 4.0)`, `c = 8.0` to yield `(50.0, 35.0)`, i.e. `a[i] + b[i] * c`
/// per lane.
///
/// Under `cfg(test)`, `assert_instr` names `fmla` as the instruction for
/// this intrinsic.
///
/// # Safety
///
/// NOTE(review): declared `unsafe` with `#[target_feature(enable = "neon")]`;
/// presumably the caller must ensure NEON is available - confirm against the
/// crate's safety conventions.
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmla))]
|
||||
pub unsafe fn vfmaq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t {
|
||||
// Duplicate the scalar multiplier `c` into both lanes.
let d: float64x2_t = transmute(f64x2::new(c, c));
|
||||
// Argument order is (b, d, a): `a` is passed last.
vfmaq_f64(b, d, a)
|
||||
}
|
||||
|
||||
/// Divide
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
|
|
@ -8232,6 +8250,26 @@ mod test {
|
|||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
// Checks `vfma_n_f64` on its single lane: a = 2.0, b = 6.0, c = 8.0 must
// give 50.0 (= 2.0 + 6.0 * 8.0).
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfma_n_f64() {
|
||||
let a: f64 = 2.0;
|
||||
let b: f64 = 6.0;
|
||||
let c: f64 = 8.0;
|
||||
// Expected result for the inputs above.
let e: f64 = 50.0;
|
||||
// `transmute` reinterprets the plain `f64` inputs `a` and `b` as the
// `float64x1_t` arguments and the vector result back as `f64`. The third
// parameter of `vfma_n_f64` is already `f64`, so `transmute(c)` is an
// identity conversion.
let r: f64 = transmute(vfma_n_f64(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
// Checks `vfmaq_n_f64` on two lanes: a = (2.0, 3.0), b = (6.0, 4.0), c = 8.0
// must give (50.0, 35.0), i.e. a[i] + b[i] * 8.0 per lane.
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmaq_n_f64() {
|
||||
let a: f64x2 = f64x2::new(2.0, 3.0);
|
||||
let b: f64x2 = f64x2::new(6.0, 4.0);
|
||||
let c: f64 = 8.0;
|
||||
// Expected per-lane results for the inputs above.
let e: f64x2 = f64x2::new(50.0, 35.0);
|
||||
// `transmute` reinterprets the `f64x2` inputs as the `float64x2_t`
// arguments and the result back as `f64x2`. The third parameter of
// `vfmaq_n_f64` is already `f64`, so `transmute(c)` is an identity
// conversion.
let r: f64x2 = transmute(vfmaq_n_f64(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vdiv_f32() {
|
||||
let a: f32x2 = f32x2::new(2.0, 6.0);
|
||||
|
|
|
|||
|
|
@ -4738,6 +4738,28 @@ pub unsafe fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float
|
|||
vfmaq_f32_(a, b, c)
|
||||
}
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
///
/// Scalar-operand (`_n`) form: the scalar `c` is copied into both lanes of a
/// two-lane vector `d` and the call is forwarded as `vfma_f32(b, d, a)`.
/// The accompanying `test_vfma_n_f32` expects `a = (2.0, 3.0)`,
/// `b = (6.0, 4.0)`, `c = 8.0` to yield `(50.0, 35.0)`, i.e. `a[i] + b[i] * c`
/// per lane.
///
/// On `target_arch = "arm"` the `fp-armv8,v8` target features are enabled in
/// addition to `neon`. Under test, `assert_instr` names `vfma` on arm and
/// `fmla` on aarch64.
///
/// # Safety
///
/// NOTE(review): declared `unsafe` with `#[target_feature(...)]` attributes;
/// presumably the caller must ensure the enabled features are available -
/// confirm against the crate's safety conventions.
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
|
||||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
|
||||
pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
|
||||
// Duplicate the scalar multiplier `c` into both lanes.
let d: float32x2_t = transmute(f32x2::new(c, c));
|
||||
// Argument order is (b, d, a): `a` is passed last.
vfma_f32(b, d, a)
|
||||
}
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
///
/// Scalar-operand (`_n`) form: the scalar `c` is copied into all four lanes
/// of a vector `d` and the call is forwarded as `vfmaq_f32(b, d, a)`.
/// The accompanying `test_vfmaq_n_f32` expects `a = (2.0, 3.0, 4.0, 5.0)`,
/// `b = (6.0, 4.0, 7.0, 8.0)`, `c = 8.0` to yield `(50.0, 35.0, 60.0, 69.0)`,
/// i.e. `a[i] + b[i] * c` per lane.
///
/// On `target_arch = "arm"` the `fp-armv8,v8` target features are enabled in
/// addition to `neon`. Under test, `assert_instr` names `vfma` on arm and
/// `fmla` on aarch64.
///
/// NOTE(review): the `assert_instr` harness change in the same commit records
/// `vfmaq_n_f32_vfma : #instructions = 26 >= 22 (limit)` and lists `"vfma"`
/// under a limit of 27, so the code emitted for this function is close to
/// that limit; be careful when restructuring the body.
///
/// # Safety
///
/// NOTE(review): declared `unsafe` with `#[target_feature(...)]` attributes;
/// presumably the caller must ensure the enabled features are available -
/// confirm against the crate's safety conventions.
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
|
||||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
|
||||
pub unsafe fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
|
||||
// Duplicate the scalar multiplier `c` into all four lanes.
let d: float32x4_t = transmute(f32x4::new(c, c, c, c));
|
||||
// Argument order is (b, d, a): `a` is passed last.
vfmaq_f32(b, d, a)
|
||||
}
|
||||
|
||||
/// Subtract
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
|
|
@ -13974,6 +13996,26 @@ mod test {
|
|||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
// Checks `vfma_n_f32` on two lanes: a = (2.0, 3.0), b = (6.0, 4.0), c = 8.0
// must give (50.0, 35.0), i.e. a[i] + b[i] * 8.0 per lane.
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfma_n_f32() {
|
||||
let a: f32x2 = f32x2::new(2.0, 3.0);
|
||||
let b: f32x2 = f32x2::new(6.0, 4.0);
|
||||
let c: f32 = 8.0;
|
||||
// Expected per-lane results for the inputs above.
let e: f32x2 = f32x2::new(50.0, 35.0);
|
||||
// `transmute` reinterprets the `f32x2` inputs as the `float32x2_t`
// arguments and the result back as `f32x2`. The third parameter of
// `vfma_n_f32` is already `f32`, so `transmute(c)` is an identity
// conversion.
let r: f32x2 = transmute(vfma_n_f32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
// Checks `vfmaq_n_f32` on four lanes: a = (2.0, 3.0, 4.0, 5.0),
// b = (6.0, 4.0, 7.0, 8.0), c = 8.0 must give (50.0, 35.0, 60.0, 69.0),
// i.e. a[i] + b[i] * 8.0 per lane.
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmaq_n_f32() {
|
||||
let a: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
|
||||
let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
|
||||
let c: f32 = 8.0;
|
||||
// Expected per-lane results for the inputs above.
let e: f32x4 = f32x4::new(50.0, 35.0, 60.0, 69.0);
|
||||
// `transmute` reinterprets the `f32x4` inputs as the `float32x4_t`
// arguments and the result back as `f32x4`. The third parameter of
// `vfmaq_n_f32` is already `f32`, so `transmute(c)` is an identity
// conversion.
let r: f32x4 = transmute(vfmaq_n_f32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vsub_s8() {
|
||||
let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
|
|
|
|||
|
|
@ -1566,6 +1566,62 @@ link-arm = llvm.fma._EXT_
|
|||
link-aarch64 = llvm.fma._EXT_
|
||||
generate float*_t
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
name = vfma
|
||||
n-suffix
|
||||
multi_fn = transmute, d:in_t, {f64x1::new, c}
|
||||
multi_fn = vfma-self-noext, b, transmute(d), a
|
||||
a = 2.0, 3.0, 4.0, 5.0
|
||||
b = 6.0, 4.0, 7.0, 8.0
|
||||
c = 8.0
|
||||
validate 50.0, 35.0, 60.0, 69.0
|
||||
|
||||
aarch64 = fmadd
|
||||
generate float64x1_t:float64x1_t:f64:float64x1_t
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
name = vfma
|
||||
n-suffix
|
||||
multi_fn = transmute, d:in_t, {f64x2::new, c, c}
|
||||
multi_fn = vfma-self-noext, b, d, a
|
||||
a = 2.0, 3.0, 4.0, 5.0
|
||||
b = 6.0, 4.0, 7.0, 8.0
|
||||
c = 8.0
|
||||
validate 50.0, 35.0, 60.0, 69.0
|
||||
|
||||
aarch64 = fmla
|
||||
generate float64x2_t:float64x2_t:f64:float64x2_t
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
name = vfma
|
||||
n-suffix
|
||||
multi_fn = transmute, d:in_t, {f32x2::new, c, c}
|
||||
multi_fn = vfma-self-noext, b, d, a
|
||||
a = 2.0, 3.0, 4.0, 5.0
|
||||
b = 6.0, 4.0, 7.0, 8.0
|
||||
c = 8.0
|
||||
validate 50.0, 35.0, 60.0, 69.0
|
||||
|
||||
target = fp-armv8
|
||||
arm = vfma
|
||||
aarch64 = fmla
|
||||
generate float32x2_t:float32x2_t:f32:float32x2_t
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
name = vfma
|
||||
n-suffix
|
||||
multi_fn = transmute, d:in_t, {f32x4::new, c, c, c, c}
|
||||
multi_fn = vfma-self-noext, b, d, a
|
||||
a = 2.0, 3.0, 4.0, 5.0
|
||||
b = 6.0, 4.0, 7.0, 8.0
|
||||
c = 8.0
|
||||
validate 50.0, 35.0, 60.0, 69.0
|
||||
|
||||
target = fp-armv8
|
||||
arm = vfma
|
||||
aarch64 = fmla
|
||||
generate float32x4_t:float32x4_t:f32:float32x4_t
|
||||
|
||||
/// Divide
|
||||
name = vdiv
|
||||
fn = simd_div
|
||||
|
|
|
|||
|
|
@ -120,9 +120,9 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
|
|||
// Intrinsics using `cvtpi2ps` are typically "composites" and
|
||||
// in some cases exceed the limit.
|
||||
"cvtpi2ps" => 25,
|
||||
|
||||
// core_arch/src/arm_shared/simd32
|
||||
"usad8" => 27,
|
||||
// vfmaq_n_f32_vfma : #instructions = 26 >= 22 (limit)
|
||||
"usad8" | "vfma" => 27,
|
||||
"qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29,
|
||||
|
||||
// Original limit was 20 instructions, but ARM DSP Intrinsics
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue