Add vfma and vfms neon instructions (#1169)
This commit is contained in:
parent
b216e9f9c4
commit
10f7ebc387
5 changed files with 788 additions and 64 deletions
|
|
@ -5021,7 +5021,7 @@ pub unsafe fn vfma_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float6
|
|||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v1f64")]
|
||||
fn vfma_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t;
|
||||
}
|
||||
vfma_f64_(a, b, c)
|
||||
vfma_f64_(b, c, a)
|
||||
}
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
|
|
@ -5034,7 +5034,7 @@ pub unsafe fn vfmaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float
|
|||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v2f64")]
|
||||
fn vfmaq_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t;
|
||||
}
|
||||
vfmaq_f64_(a, b, c)
|
||||
vfmaq_f64_(b, c, a)
|
||||
}
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
|
|
@ -5042,8 +5042,7 @@ pub unsafe fn vfmaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float
|
|||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmadd))]
|
||||
pub unsafe fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t {
|
||||
let d: float64x1_t = transmute(f64x1::new(c));
|
||||
vfma_f64(b, transmute(d), a)
|
||||
vfma_f64(a, b, vdup_n_f64(c))
|
||||
}
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
|
|
@ -5051,8 +5050,301 @@ pub unsafe fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t
|
|||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmla))]
|
||||
pub unsafe fn vfmaq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t {
|
||||
let d: float64x2_t = transmute(f64x2::new(c, c));
|
||||
vfmaq_f64(b, d, a)
|
||||
vfmaq_f64(a, b, vdupq_n_f64(c))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-add to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfma_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
|
||||
static_assert_imm1!(LANE);
|
||||
vfma_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-add to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfma_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
|
||||
static_assert_imm2!(LANE);
|
||||
vfma_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-add to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmaq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
|
||||
static_assert_imm1!(LANE);
|
||||
vfmaq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-add to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmaq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
|
||||
static_assert_imm2!(LANE);
|
||||
vfmaq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-add to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmadd, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfma_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
|
||||
static_assert!(LANE : i32 where LANE == 0);
|
||||
vfma_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-add to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfma_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x2_t) -> float64x1_t {
|
||||
static_assert_imm1!(LANE);
|
||||
vfma_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-add to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmaq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x1_t) -> float64x2_t {
|
||||
static_assert!(LANE : i32 where LANE == 0);
|
||||
vfmaq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-add to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmaq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
|
||||
static_assert_imm1!(LANE);
|
||||
vfmaq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-add to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmas_lane_f32<const LANE: i32>(a: f32, b: f32, c: float32x2_t) -> f32 {
|
||||
#[allow(improper_ctypes)]
|
||||
extern "C" {
|
||||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f32")]
|
||||
fn vfmas_lane_f32_(a: f32, b: f32, c: f32) -> f32;
|
||||
}
|
||||
static_assert_imm1!(LANE);
|
||||
let c: f32 = simd_extract(c, LANE as u32);
|
||||
vfmas_lane_f32_(b, c, a)
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-add to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmas_laneq_f32<const LANE: i32>(a: f32, b: f32, c: float32x4_t) -> f32 {
|
||||
#[allow(improper_ctypes)]
|
||||
extern "C" {
|
||||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f32")]
|
||||
fn vfmas_laneq_f32_(a: f32, b: f32, c: f32) -> f32;
|
||||
}
|
||||
static_assert_imm2!(LANE);
|
||||
let c: f32 = simd_extract(c, LANE as u32);
|
||||
vfmas_laneq_f32_(b, c, a)
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-add to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmadd, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmad_lane_f64<const LANE: i32>(a: f64, b: f64, c: float64x1_t) -> f64 {
|
||||
#[allow(improper_ctypes)]
|
||||
extern "C" {
|
||||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f64")]
|
||||
fn vfmad_lane_f64_(a: f64, b: f64, c: f64) -> f64;
|
||||
}
|
||||
static_assert!(LANE : i32 where LANE == 0);
|
||||
let c: f64 = simd_extract(c, LANE as u32);
|
||||
vfmad_lane_f64_(b, c, a)
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-add to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmad_laneq_f64<const LANE: i32>(a: f64, b: f64, c: float64x2_t) -> f64 {
|
||||
#[allow(improper_ctypes)]
|
||||
extern "C" {
|
||||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f64")]
|
||||
fn vfmad_laneq_f64_(a: f64, b: f64, c: f64) -> f64;
|
||||
}
|
||||
static_assert_imm1!(LANE);
|
||||
let c: f64 = simd_extract(c, LANE as u32);
|
||||
vfmad_laneq_f64_(b, c, a)
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract from accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmsub))]
|
||||
pub unsafe fn vfms_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
|
||||
let b: float64x1_t = simd_neg(b);
|
||||
vfma_f64(a, b, c)
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract from accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmls))]
|
||||
pub unsafe fn vfmsq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
|
||||
let b: float64x2_t = simd_neg(b);
|
||||
vfmaq_f64(a, b, c)
|
||||
}
|
||||
|
||||
/// Floating-point fused Multiply-subtract to accumulator(vector)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmsub))]
|
||||
pub unsafe fn vfms_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t {
|
||||
vfms_f64(a, b, vdup_n_f64(c))
|
||||
}
|
||||
|
||||
/// Floating-point fused Multiply-subtract to accumulator(vector)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmls))]
|
||||
pub unsafe fn vfmsq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t {
|
||||
vfmsq_f64(a, b, vdupq_n_f64(c))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfms_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
|
||||
static_assert_imm1!(LANE);
|
||||
vfms_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfms_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
|
||||
static_assert_imm2!(LANE);
|
||||
vfms_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmsq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
|
||||
static_assert_imm1!(LANE);
|
||||
vfmsq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmsq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
|
||||
static_assert_imm2!(LANE);
|
||||
vfmsq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmsub, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfms_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
|
||||
static_assert!(LANE : i32 where LANE == 0);
|
||||
vfms_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfms_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x2_t) -> float64x1_t {
|
||||
static_assert_imm1!(LANE);
|
||||
vfms_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmsq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x1_t) -> float64x2_t {
|
||||
static_assert!(LANE : i32 where LANE == 0);
|
||||
vfmsq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmsq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
|
||||
static_assert_imm1!(LANE);
|
||||
vfmsq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmss_lane_f32<const LANE: i32>(a: f32, b: f32, c: float32x2_t) -> f32 {
|
||||
vfmas_lane_f32::<LANE>(a, -b, c)
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmss_laneq_f32<const LANE: i32>(a: f32, b: f32, c: float32x4_t) -> f32 {
|
||||
vfmas_laneq_f32::<LANE>(a, -b, c)
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmsub, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmsd_lane_f64<const LANE: i32>(a: f64, b: f64, c: float64x1_t) -> f64 {
|
||||
vfmad_lane_f64::<LANE>(a, -b, c)
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract to accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vfmsd_laneq_f64<const LANE: i32>(a: f64, b: f64, c: float64x2_t) -> f64 {
|
||||
vfmad_laneq_f64::<LANE>(a, -b, c)
|
||||
}
|
||||
|
||||
/// Divide
|
||||
|
|
@ -13006,9 +13298,9 @@ mod test {
|
|||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfma_f64() {
|
||||
let a: f64 = 2.0;
|
||||
let a: f64 = 8.0;
|
||||
let b: f64 = 6.0;
|
||||
let c: f64 = 8.0;
|
||||
let c: f64 = 2.0;
|
||||
let e: f64 = 20.0;
|
||||
let r: f64 = transmute(vfma_f64(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
|
|
@ -13016,9 +13308,9 @@ mod test {
|
|||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmaq_f64() {
|
||||
let a: f64x2 = f64x2::new(2.0, 3.0);
|
||||
let a: f64x2 = f64x2::new(8.0, 18.0);
|
||||
let b: f64x2 = f64x2::new(6.0, 4.0);
|
||||
let c: f64x2 = f64x2::new(8.0, 18.0);
|
||||
let c: f64x2 = f64x2::new(2.0, 3.0);
|
||||
let e: f64x2 = f64x2::new(20.0, 30.0);
|
||||
let r: f64x2 = transmute(vfmaq_f64(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
|
|
@ -13044,6 +13336,286 @@ mod test {
|
|||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfma_lane_f32() {
|
||||
let a: f32x2 = f32x2::new(2., 3.);
|
||||
let b: f32x2 = f32x2::new(6., 4.);
|
||||
let c: f32x2 = f32x2::new(2., 0.);
|
||||
let e: f32x2 = f32x2::new(14., 11.);
|
||||
let r: f32x2 = transmute(vfma_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfma_laneq_f32() {
|
||||
let a: f32x2 = f32x2::new(2., 3.);
|
||||
let b: f32x2 = f32x2::new(6., 4.);
|
||||
let c: f32x4 = f32x4::new(2., 0., 0., 0.);
|
||||
let e: f32x2 = f32x2::new(14., 11.);
|
||||
let r: f32x2 = transmute(vfma_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmaq_lane_f32() {
|
||||
let a: f32x4 = f32x4::new(2., 3., 4., 5.);
|
||||
let b: f32x4 = f32x4::new(6., 4., 7., 8.);
|
||||
let c: f32x2 = f32x2::new(2., 0.);
|
||||
let e: f32x4 = f32x4::new(14., 11., 18., 21.);
|
||||
let r: f32x4 = transmute(vfmaq_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmaq_laneq_f32() {
|
||||
let a: f32x4 = f32x4::new(2., 3., 4., 5.);
|
||||
let b: f32x4 = f32x4::new(6., 4., 7., 8.);
|
||||
let c: f32x4 = f32x4::new(2., 0., 0., 0.);
|
||||
let e: f32x4 = f32x4::new(14., 11., 18., 21.);
|
||||
let r: f32x4 = transmute(vfmaq_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfma_lane_f64() {
|
||||
let a: f64 = 2.;
|
||||
let b: f64 = 6.;
|
||||
let c: f64 = 2.;
|
||||
let e: f64 = 14.;
|
||||
let r: f64 = transmute(vfma_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfma_laneq_f64() {
|
||||
let a: f64 = 2.;
|
||||
let b: f64 = 6.;
|
||||
let c: f64x2 = f64x2::new(2., 0.);
|
||||
let e: f64 = 14.;
|
||||
let r: f64 = transmute(vfma_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmaq_lane_f64() {
|
||||
let a: f64x2 = f64x2::new(2., 3.);
|
||||
let b: f64x2 = f64x2::new(6., 4.);
|
||||
let c: f64 = 2.;
|
||||
let e: f64x2 = f64x2::new(14., 11.);
|
||||
let r: f64x2 = transmute(vfmaq_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmaq_laneq_f64() {
|
||||
let a: f64x2 = f64x2::new(2., 3.);
|
||||
let b: f64x2 = f64x2::new(6., 4.);
|
||||
let c: f64x2 = f64x2::new(2., 0.);
|
||||
let e: f64x2 = f64x2::new(14., 11.);
|
||||
let r: f64x2 = transmute(vfmaq_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmas_lane_f32() {
|
||||
let a: f32 = 2.;
|
||||
let b: f32 = 6.;
|
||||
let c: f32x2 = f32x2::new(3., 0.);
|
||||
let e: f32 = 20.;
|
||||
let r: f32 = transmute(vfmas_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmas_laneq_f32() {
|
||||
let a: f32 = 2.;
|
||||
let b: f32 = 6.;
|
||||
let c: f32x4 = f32x4::new(3., 0., 0., 0.);
|
||||
let e: f32 = 20.;
|
||||
let r: f32 = transmute(vfmas_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmad_lane_f64() {
|
||||
let a: f64 = 2.;
|
||||
let b: f64 = 6.;
|
||||
let c: f64 = 3.;
|
||||
let e: f64 = 20.;
|
||||
let r: f64 = transmute(vfmad_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmad_laneq_f64() {
|
||||
let a: f64 = 2.;
|
||||
let b: f64 = 6.;
|
||||
let c: f64x2 = f64x2::new(3., 0.);
|
||||
let e: f64 = 20.;
|
||||
let r: f64 = transmute(vfmad_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfms_f64() {
|
||||
let a: f64 = 20.0;
|
||||
let b: f64 = 6.0;
|
||||
let c: f64 = 2.0;
|
||||
let e: f64 = 8.0;
|
||||
let r: f64 = transmute(vfms_f64(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmsq_f64() {
|
||||
let a: f64x2 = f64x2::new(20.0, 30.0);
|
||||
let b: f64x2 = f64x2::new(6.0, 4.0);
|
||||
let c: f64x2 = f64x2::new(2.0, 3.0);
|
||||
let e: f64x2 = f64x2::new(8.0, 18.0);
|
||||
let r: f64x2 = transmute(vfmsq_f64(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfms_n_f64() {
|
||||
let a: f64 = 50.0;
|
||||
let b: f64 = 6.0;
|
||||
let c: f64 = 8.0;
|
||||
let e: f64 = 2.0;
|
||||
let r: f64 = transmute(vfms_n_f64(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmsq_n_f64() {
|
||||
let a: f64x2 = f64x2::new(50.0, 35.0);
|
||||
let b: f64x2 = f64x2::new(6.0, 4.0);
|
||||
let c: f64 = 8.0;
|
||||
let e: f64x2 = f64x2::new(2.0, 3.0);
|
||||
let r: f64x2 = transmute(vfmsq_n_f64(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfms_lane_f32() {
|
||||
let a: f32x2 = f32x2::new(14., 11.);
|
||||
let b: f32x2 = f32x2::new(6., 4.);
|
||||
let c: f32x2 = f32x2::new(2., 0.);
|
||||
let e: f32x2 = f32x2::new(2., 3.);
|
||||
let r: f32x2 = transmute(vfms_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfms_laneq_f32() {
|
||||
let a: f32x2 = f32x2::new(14., 11.);
|
||||
let b: f32x2 = f32x2::new(6., 4.);
|
||||
let c: f32x4 = f32x4::new(2., 0., 0., 0.);
|
||||
let e: f32x2 = f32x2::new(2., 3.);
|
||||
let r: f32x2 = transmute(vfms_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmsq_lane_f32() {
|
||||
let a: f32x4 = f32x4::new(14., 11., 18., 21.);
|
||||
let b: f32x4 = f32x4::new(6., 4., 7., 8.);
|
||||
let c: f32x2 = f32x2::new(2., 0.);
|
||||
let e: f32x4 = f32x4::new(2., 3., 4., 5.);
|
||||
let r: f32x4 = transmute(vfmsq_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmsq_laneq_f32() {
|
||||
let a: f32x4 = f32x4::new(14., 11., 18., 21.);
|
||||
let b: f32x4 = f32x4::new(6., 4., 7., 8.);
|
||||
let c: f32x4 = f32x4::new(2., 0., 0., 0.);
|
||||
let e: f32x4 = f32x4::new(2., 3., 4., 5.);
|
||||
let r: f32x4 = transmute(vfmsq_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfms_lane_f64() {
|
||||
let a: f64 = 14.;
|
||||
let b: f64 = 6.;
|
||||
let c: f64 = 2.;
|
||||
let e: f64 = 2.;
|
||||
let r: f64 = transmute(vfms_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfms_laneq_f64() {
|
||||
let a: f64 = 14.;
|
||||
let b: f64 = 6.;
|
||||
let c: f64x2 = f64x2::new(2., 0.);
|
||||
let e: f64 = 2.;
|
||||
let r: f64 = transmute(vfms_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmsq_lane_f64() {
|
||||
let a: f64x2 = f64x2::new(14., 11.);
|
||||
let b: f64x2 = f64x2::new(6., 4.);
|
||||
let c: f64 = 2.;
|
||||
let e: f64x2 = f64x2::new(2., 3.);
|
||||
let r: f64x2 = transmute(vfmsq_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmsq_laneq_f64() {
|
||||
let a: f64x2 = f64x2::new(14., 11.);
|
||||
let b: f64x2 = f64x2::new(6., 4.);
|
||||
let c: f64x2 = f64x2::new(2., 0.);
|
||||
let e: f64x2 = f64x2::new(2., 3.);
|
||||
let r: f64x2 = transmute(vfmsq_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmss_lane_f32() {
|
||||
let a: f32 = 14.;
|
||||
let b: f32 = 6.;
|
||||
let c: f32x2 = f32x2::new(2., 0.);
|
||||
let e: f32 = 2.;
|
||||
let r: f32 = transmute(vfmss_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmss_laneq_f32() {
|
||||
let a: f32 = 14.;
|
||||
let b: f32 = 6.;
|
||||
let c: f32x4 = f32x4::new(2., 0., 0., 0.);
|
||||
let e: f32 = 2.;
|
||||
let r: f32 = transmute(vfmss_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmsd_lane_f64() {
|
||||
let a: f64 = 14.;
|
||||
let b: f64 = 6.;
|
||||
let c: f64 = 2.;
|
||||
let e: f64 = 2.;
|
||||
let r: f64 = transmute(vfmsd_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmsd_laneq_f64() {
|
||||
let a: f64 = 14.;
|
||||
let b: f64 = 6.;
|
||||
let c: f64x2 = f64x2::new(2., 0.);
|
||||
let e: f64 = 2.;
|
||||
let r: f64 = transmute(vfmsd_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vdiv_f32() {
|
||||
let a: f32x2 = f32x2::new(2.0, 6.0);
|
||||
|
|
|
|||
|
|
@ -6607,7 +6607,7 @@ pub unsafe fn vfma_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float3
|
|||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v2f32")]
|
||||
fn vfma_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
|
||||
}
|
||||
vfma_f32_(a, b, c)
|
||||
vfma_f32_(b, c, a)
|
||||
}
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
|
|
@ -6623,7 +6623,7 @@ pub unsafe fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float
|
|||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v4f32")]
|
||||
fn vfmaq_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
|
||||
}
|
||||
vfmaq_f32_(a, b, c)
|
||||
vfmaq_f32_(b, c, a)
|
||||
}
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
|
|
@ -6633,8 +6633,7 @@ vfmaq_f32_(a, b, c)
|
|||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
|
||||
pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
|
||||
let d: float32x2_t = transmute(f32x2::new(c, c));
|
||||
vfma_f32(b, d, a)
|
||||
vfma_f32(a, b, vdup_n_f32(c))
|
||||
}
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
|
|
@ -6644,8 +6643,49 @@ pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t
|
|||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
|
||||
pub unsafe fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
|
||||
let d: float32x4_t = transmute(f32x4::new(c, c, c, c));
|
||||
vfmaq_f32(b, d, a)
|
||||
vfmaq_f32(a, b, vdupq_n_f32(c))
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract from accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
|
||||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
|
||||
pub unsafe fn vfms_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
|
||||
let b: float32x2_t = simd_neg(b);
|
||||
vfma_f32(a, b, c)
|
||||
}
|
||||
|
||||
/// Floating-point fused multiply-subtract from accumulator
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
|
||||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
|
||||
pub unsafe fn vfmsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
|
||||
let b: float32x4_t = simd_neg(b);
|
||||
vfmaq_f32(a, b, c)
|
||||
}
|
||||
|
||||
/// Floating-point fused Multiply-subtract to accumulator(vector)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
|
||||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
|
||||
pub unsafe fn vfms_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
|
||||
vfms_f32(a, b, vdup_n_f32(c))
|
||||
}
|
||||
|
||||
/// Floating-point fused Multiply-subtract to accumulator(vector)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
|
||||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
|
||||
pub unsafe fn vfmsq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
|
||||
vfmsq_f32(a, b, vdupq_n_f32(c))
|
||||
}
|
||||
|
||||
/// Subtract
|
||||
|
|
@ -19484,9 +19524,9 @@ mod test {
|
|||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfma_f32() {
|
||||
let a: f32x2 = f32x2::new(2.0, 3.0);
|
||||
let a: f32x2 = f32x2::new(8.0, 18.0);
|
||||
let b: f32x2 = f32x2::new(6.0, 4.0);
|
||||
let c: f32x2 = f32x2::new(8.0, 18.0);
|
||||
let c: f32x2 = f32x2::new(2.0, 3.0);
|
||||
let e: f32x2 = f32x2::new(20.0, 30.0);
|
||||
let r: f32x2 = transmute(vfma_f32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
|
|
@ -19494,9 +19534,9 @@ mod test {
|
|||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmaq_f32() {
|
||||
let a: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
|
||||
let a: f32x4 = f32x4::new(8.0, 18.0, 12.0, 10.0);
|
||||
let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
|
||||
let c: f32x4 = f32x4::new(8.0, 18.0, 12.0, 10.0);
|
||||
let c: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
|
||||
let e: f32x4 = f32x4::new(20.0, 30.0, 40.0, 50.0);
|
||||
let r: f32x4 = transmute(vfmaq_f32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
|
|
@ -19522,6 +19562,46 @@ mod test {
|
|||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfms_f32() {
|
||||
let a: f32x2 = f32x2::new(20.0, 30.0);
|
||||
let b: f32x2 = f32x2::new(6.0, 4.0);
|
||||
let c: f32x2 = f32x2::new(2.0, 3.0);
|
||||
let e: f32x2 = f32x2::new(8.0, 18.0);
|
||||
let r: f32x2 = transmute(vfms_f32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmsq_f32() {
|
||||
let a: f32x4 = f32x4::new(20.0, 30.0, 40.0, 50.0);
|
||||
let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
|
||||
let c: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
|
||||
let e: f32x4 = f32x4::new(8.0, 18.0, 12.0, 10.0);
|
||||
let r: f32x4 = transmute(vfmsq_f32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfms_n_f32() {
|
||||
let a: f32x2 = f32x2::new(50.0, 35.0);
|
||||
let b: f32x2 = f32x2::new(6.0, 4.0);
|
||||
let c: f32 = 8.0;
|
||||
let e: f32x2 = f32x2::new(2.0, 3.0);
|
||||
let r: f32x2 = transmute(vfms_n_f32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vfmsq_n_f32() {
|
||||
let a: f32x4 = f32x4::new(50.0, 35.0, 60.0, 69.0);
|
||||
let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
|
||||
let c: f32 = 8.0;
|
||||
let e: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
|
||||
let r: f32x4 = transmute(vfmsq_n_f32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vsub_s8() {
|
||||
let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
|
|
|
|||
|
|
@ -37,7 +37,6 @@
|
|||
external_doc,
|
||||
allow_internal_unstable,
|
||||
decl_macro,
|
||||
extended_key_value_attributes,
|
||||
bench_black_box
|
||||
)]
|
||||
#![cfg_attr(test, feature(test, abi_vectorcall))]
|
||||
|
|
|
|||
|
|
@ -2402,31 +2402,27 @@ generate f32:float32x2_t:f32, f32:float32x4_t:f32, f64:float64x1_t:f64, f64:floa
|
|||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
name = vfma
|
||||
a = 2.0, 3.0, 4.0, 5.0
|
||||
multi_fn = vfma-self-_, b, c, a
|
||||
a = 8.0, 18.0, 12.0, 10.0
|
||||
b = 6.0, 4.0, 7.0, 8.0
|
||||
c = 8.0, 18.0, 12.0, 10.0
|
||||
c = 2.0, 3.0, 4.0, 5.0
|
||||
validate 20.0, 30.0, 40.0, 50.0
|
||||
|
||||
link-aarch64 = llvm.fma._EXT_
|
||||
aarch64 = fmadd
|
||||
link-aarch64 = llvm.fma._EXT_
|
||||
generate float64x1_t
|
||||
|
||||
aarch64 = fmla
|
||||
link-aarch64 = llvm.fma._EXT_
|
||||
generate float64x2_t
|
||||
|
||||
target = fp-armv8
|
||||
arm = vfma
|
||||
aarch64 = fmla
|
||||
link-arm = llvm.fma._EXT_
|
||||
link-aarch64 = llvm.fma._EXT_
|
||||
generate float*_t
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
name = vfma
|
||||
n-suffix
|
||||
multi_fn = transmute, d:in_t, {f64x1::new, c}
|
||||
multi_fn = vfma-self-noext, b, transmute(d), a
|
||||
multi_fn = vfma-self-noext, a, b, {vdup-nself-noext, c}
|
||||
a = 2.0, 3.0, 4.0, 5.0
|
||||
b = 6.0, 4.0, 7.0, 8.0
|
||||
c = 8.0
|
||||
|
|
@ -2434,49 +2430,126 @@ validate 50.0, 35.0, 60.0, 69.0
|
|||
|
||||
aarch64 = fmadd
|
||||
generate float64x1_t:float64x1_t:f64:float64x1_t
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
name = vfma
|
||||
n-suffix
|
||||
multi_fn = transmute, d:in_t, {f64x2::new, c, c}
|
||||
multi_fn = vfma-self-noext, b, d, a
|
||||
a = 2.0, 3.0, 4.0, 5.0
|
||||
b = 6.0, 4.0, 7.0, 8.0
|
||||
c = 8.0
|
||||
validate 50.0, 35.0, 60.0, 69.0
|
||||
|
||||
aarch64 = fmla
|
||||
generate float64x2_t:float64x2_t:f64:float64x2_t
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
name = vfma
|
||||
n-suffix
|
||||
multi_fn = transmute, d:in_t, {f32x2::new, c, c}
|
||||
multi_fn = vfma-self-noext, b, d, a
|
||||
a = 2.0, 3.0, 4.0, 5.0
|
||||
b = 6.0, 4.0, 7.0, 8.0
|
||||
c = 8.0
|
||||
validate 50.0, 35.0, 60.0, 69.0
|
||||
|
||||
target = fp-armv8
|
||||
arm = vfma
|
||||
aarch64 = fmla
|
||||
generate float32x2_t:float32x2_t:f32:float32x2_t
|
||||
generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
|
||||
|
||||
/// Floating-point fused Multiply-Add to accumulator(vector)
|
||||
/// Floating-point fused multiply-add to accumulator
|
||||
name = vfma
|
||||
n-suffix
|
||||
multi_fn = transmute, d:in_t, {f32x4::new, c, c, c, c}
|
||||
multi_fn = vfma-self-noext, b, d, a
|
||||
a = 2.0, 3.0, 4.0, 5.0
|
||||
in2-lane-suffixes
|
||||
constn = LANE
|
||||
multi_fn = static_assert_imm-in2_exp_len-LANE
|
||||
multi_fn = vfma-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}}
|
||||
a = 2., 3., 4., 5.
|
||||
b = 6., 4., 7., 8.
|
||||
c = 2., 0., 0., 0.
|
||||
n = 0
|
||||
validate 14., 11., 18., 21.
|
||||
|
||||
aarch64 = fmla
|
||||
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
|
||||
aarch64 = fmadd
|
||||
generate float64x1_t
|
||||
aarch64 = fmla
|
||||
generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t
|
||||
|
||||
/// Floating-point fused multiply-add to accumulator
|
||||
name = vfma
|
||||
in2-lane-suffixes
|
||||
constn = LANE
|
||||
multi_fn = static_assert_imm-in2_exp_len-LANE
|
||||
multi_fn = simd_extract, c:out_t, c, LANE as u32
|
||||
multi_fn = vfma-in2lane-_, b, c, a
|
||||
a = 2.
|
||||
b = 6.
|
||||
c = 3., 0., 0., 0.
|
||||
n = 0
|
||||
validate 20.
|
||||
|
||||
aarch64 = fmla
|
||||
link-aarch64 = llvm.fma._EXT_:f32:f32:f32:f32
|
||||
generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32
|
||||
link-aarch64 = llvm.fma._EXT_:f64:f64:f64:f64
|
||||
aarch64 = fmadd
|
||||
generate f64:f64:float64x1_t:f64
|
||||
aarch64 = fmla
|
||||
generate f64:f64:float64x2_t:f64
|
||||
|
||||
/// Floating-point fused multiply-subtract from accumulator
|
||||
name = vfms
|
||||
multi_fn = simd_neg, b:in_t, b
|
||||
multi_fn = vfma-self-noext, a, b, c
|
||||
a = 20.0, 30.0, 40.0, 50.0
|
||||
b = 6.0, 4.0, 7.0, 8.0
|
||||
c = 8.0
|
||||
validate 50.0, 35.0, 60.0, 69.0
|
||||
c = 2.0, 3.0, 4.0, 5.0
|
||||
validate 8.0, 18.0, 12.0, 10.0
|
||||
|
||||
aarch64 = fmsub
|
||||
generate float64x1_t
|
||||
aarch64 = fmls
|
||||
generate float64x2_t
|
||||
|
||||
target = fp-armv8
|
||||
arm = vfma
|
||||
aarch64 = fmla
|
||||
generate float32x4_t:float32x4_t:f32:float32x4_t
|
||||
arm = vfms
|
||||
generate float*_t
|
||||
|
||||
/// Floating-point fused Multiply-subtract from accumulator(vector)
|
||||
name = vfms
|
||||
n-suffix
|
||||
multi_fn = vfms-self-noext, a, b, {vdup-nself-noext, c}
|
||||
a = 50.0, 35.0, 60.0, 69.0
|
||||
b = 6.0, 4.0, 7.0, 8.0
|
||||
c = 8.0
|
||||
validate 2.0, 3.0, 4.0, 5.0
|
||||
|
||||
aarch64 = fmsub
|
||||
generate float64x1_t:float64x1_t:f64:float64x1_t
|
||||
aarch64 = fmls
|
||||
generate float64x2_t:float64x2_t:f64:float64x2_t
|
||||
|
||||
target = fp-armv8
|
||||
arm = vfms
|
||||
generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
|
||||
|
||||
/// Floating-point fused multiply-subtract from accumulator
|
||||
name = vfms
|
||||
in2-lane-suffixes
|
||||
constn = LANE
|
||||
multi_fn = static_assert_imm-in2_exp_len-LANE
|
||||
multi_fn = vfms-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}}
|
||||
a = 14., 11., 18., 21.
|
||||
b = 6., 4., 7., 8.
|
||||
c = 2., 0., 0., 0.
|
||||
n = 0
|
||||
validate 2., 3., 4., 5.
|
||||
|
||||
aarch64 = fmls
|
||||
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
|
||||
aarch64 = fmsub
|
||||
generate float64x1_t
|
||||
aarch64 = fmls
|
||||
generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t
|
||||
|
||||
/// Floating-point fused multiply-subtract from accumulator
|
||||
name = vfms
|
||||
in2-lane-suffixes
|
||||
constn = LANE
|
||||
multi_fn = vfma-in2lane-::<LANE>, a, -b, c
|
||||
a = 14.
|
||||
b = 6.
|
||||
c = 2., 0., 0., 0.
|
||||
n = 0
|
||||
validate 2.
|
||||
|
||||
aarch64 = fmls
|
||||
generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32
|
||||
aarch64 = fmsub
|
||||
generate f64:f64:float64x1_t:f64
|
||||
aarch64 = fmls
|
||||
generate f64:f64:float64x2_t:f64
|
||||
|
||||
/// Divide
|
||||
name = vdiv
|
||||
|
|
|
|||
|
|
@ -122,7 +122,7 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
|
|||
"cvtpi2ps" => 25,
|
||||
// core_arch/src/arm_shared/simd32
|
||||
// vfmaq_n_f32_vfma : #instructions = 26 >= 22 (limit)
|
||||
"usad8" | "vfma" => 27,
|
||||
"usad8" | "vfma" | "vfms" => 27,
|
||||
"qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29,
|
||||
|
||||
// Original limit was 20 instructions, but ARM DSP Intrinsics
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue