Add vfma and vfms neon instructions (#1169)

This commit is contained in:
Sparrow Li 2021-05-21 19:26:21 +08:00 committed by GitHub
parent b216e9f9c4
commit 10f7ebc387
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 788 additions and 64 deletions

View file

@ -5021,7 +5021,7 @@ pub unsafe fn vfma_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float6
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v1f64")]
fn vfma_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t;
}
vfma_f64_(a, b, c)
vfma_f64_(b, c, a)
}
/// Floating-point fused Multiply-Add to accumulator(vector)
@ -5034,7 +5034,7 @@ pub unsafe fn vfmaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v2f64")]
fn vfmaq_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t;
}
vfmaq_f64_(a, b, c)
vfmaq_f64_(b, c, a)
}
/// Floating-point fused Multiply-Add to accumulator(vector)
@ -5042,8 +5042,7 @@ pub unsafe fn vfmaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmadd))]
pub unsafe fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t {
let d: float64x1_t = transmute(f64x1::new(c));
vfma_f64(b, transmute(d), a)
vfma_f64(a, b, vdup_n_f64(c))
}
/// Floating-point fused Multiply-Add to accumulator(vector)
@ -5051,8 +5050,301 @@ pub unsafe fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmla))]
pub unsafe fn vfmaq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t {
let d: float64x2_t = transmute(f64x2::new(c, c));
vfmaq_f64(b, d, a)
vfmaq_f64(a, b, vdupq_n_f64(c))
}
/// Floating-point fused multiply-add to accumulator
///
/// Computes `a + b * c[LANE]` per lane: lane `LANE` of `c` is broadcast,
/// then fused-multiply-added into `a` via `vfma_f32`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfma_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
// `c` has 2 lanes, so LANE must be in 0..=1.
static_assert_imm1!(LANE);
vfma_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-add to accumulator
///
/// Computes `a + b * c[LANE]` per lane, selecting the multiplier lane from a
/// 128-bit (`q`) vector `c`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfma_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
// `c` has 4 lanes, so LANE must be in 0..=3.
static_assert_imm2!(LANE);
vfma_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-add to accumulator
///
/// Computes `a + b * c[LANE]` per lane of the 128-bit accumulator, selecting
/// the multiplier lane from a 64-bit vector `c`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmaq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
// `c` has 2 lanes, so LANE must be in 0..=1.
static_assert_imm1!(LANE);
vfmaq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-add to accumulator
///
/// Computes `a + b * c[LANE]` per lane, with accumulator, multiplicand and
/// lane source all 128-bit vectors.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmaq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
// `c` has 4 lanes, so LANE must be in 0..=3.
static_assert_imm2!(LANE);
vfmaq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-add to accumulator
///
/// Computes `a + b * c[LANE]` for single-lane f64 vectors; lowers to the
/// scalar `fmadd` instruction.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmadd, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfma_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
// `c` has a single lane, so only LANE == 0 is valid.
static_assert!(LANE : i32 where LANE == 0);
vfma_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-add to accumulator
///
/// Computes `a + b * c[LANE]` for a single-lane f64 accumulator, selecting
/// the multiplier lane from a 2-lane `c`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfma_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x2_t) -> float64x1_t {
// `c` has 2 lanes, so LANE must be in 0..=1.
static_assert_imm1!(LANE);
vfma_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-add to accumulator
///
/// Computes `a + b * c[0]` per lane of the 2-lane f64 accumulator; `c` has
/// only one lane to select from.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmaq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x1_t) -> float64x2_t {
// `c` has a single lane, so only LANE == 0 is valid.
static_assert!(LANE : i32 where LANE == 0);
vfmaq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-add to accumulator
///
/// Computes `a + b * c[LANE]` per lane for 2-lane f64 vectors.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmaq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
// `c` has 2 lanes, so LANE must be in 0..=1.
static_assert_imm1!(LANE);
vfmaq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-add to accumulator
///
/// Scalar form: computes `a + b * c[LANE]` as a single f32 using the scalar
/// LLVM fma intrinsic (see the call below: fma(b, c[LANE], a)).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmas_lane_f32<const LANE: i32>(a: f32, b: f32, c: float32x2_t) -> f32 {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f32")]
fn vfmas_lane_f32_(a: f32, b: f32, c: f32) -> f32;
}
// `c` has 2 lanes, so LANE must be in 0..=1.
static_assert_imm1!(LANE);
let c: f32 = simd_extract(c, LANE as u32);
// llvm.fma(x, y, z) yields x * y + z, so this is b * c + a.
vfmas_lane_f32_(b, c, a)
}
/// Floating-point fused multiply-add to accumulator
///
/// Scalar form: computes `a + b * c[LANE]` as a single f32, selecting the
/// multiplier lane from a 4-lane vector.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmas_laneq_f32<const LANE: i32>(a: f32, b: f32, c: float32x4_t) -> f32 {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f32")]
fn vfmas_laneq_f32_(a: f32, b: f32, c: f32) -> f32;
}
// `c` has 4 lanes, so LANE must be in 0..=3.
static_assert_imm2!(LANE);
let c: f32 = simd_extract(c, LANE as u32);
// llvm.fma(x, y, z) yields x * y + z, so this is b * c + a.
vfmas_laneq_f32_(b, c, a)
}
/// Floating-point fused multiply-add to accumulator
///
/// Scalar form: computes `a + b * c[0]` as a single f64; lowers to `fmadd`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmadd, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmad_lane_f64<const LANE: i32>(a: f64, b: f64, c: float64x1_t) -> f64 {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f64")]
fn vfmad_lane_f64_(a: f64, b: f64, c: f64) -> f64;
}
// `c` has a single lane, so only LANE == 0 is valid.
static_assert!(LANE : i32 where LANE == 0);
let c: f64 = simd_extract(c, LANE as u32);
// llvm.fma(x, y, z) yields x * y + z, so this is b * c + a.
vfmad_lane_f64_(b, c, a)
}
/// Floating-point fused multiply-add to accumulator
///
/// Scalar form: computes `a + b * c[LANE]` as a single f64, selecting the
/// multiplier lane from a 2-lane vector.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmla, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmad_laneq_f64<const LANE: i32>(a: f64, b: f64, c: float64x2_t) -> f64 {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.f64")]
fn vfmad_laneq_f64_(a: f64, b: f64, c: f64) -> f64;
}
// `c` has 2 lanes, so LANE must be in 0..=1.
static_assert_imm1!(LANE);
let c: f64 = simd_extract(c, LANE as u32);
// llvm.fma(x, y, z) yields x * y + z, so this is b * c + a.
vfmad_laneq_f64_(b, c, a)
}
/// Floating-point fused multiply-subtract from accumulator
///
/// Computes `a - b * c` by negating `b` and reusing the fused multiply-add.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmsub))]
pub unsafe fn vfms_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
// a + (-b) * c == a - b * c
let b: float64x1_t = simd_neg(b);
vfma_f64(a, b, c)
}
/// Floating-point fused multiply-subtract from accumulator
///
/// Computes `a - b * c` per lane by negating `b` and reusing the fused
/// multiply-add.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmls))]
pub unsafe fn vfmsq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
// a + (-b) * c == a - b * c
let b: float64x2_t = simd_neg(b);
vfmaq_f64(a, b, c)
}
/// Floating-point fused Multiply-subtract to accumulator(vector)
///
/// Computes `a - b * c`, with the scalar `c` broadcast to every lane.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmsub))]
pub unsafe fn vfms_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t {
vfms_f64(a, b, vdup_n_f64(c))
}
/// Floating-point fused Multiply-subtract to accumulator(vector)
///
/// Computes `a - b * c` per lane, with the scalar `c` broadcast to every lane.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmls))]
pub unsafe fn vfmsq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t {
vfmsq_f64(a, b, vdupq_n_f64(c))
}
/// Floating-point fused multiply-subtract to accumulator
///
/// Computes `a - b * c[LANE]` per lane: lane `LANE` of `c` is broadcast and
/// fused-multiply-subtracted from `a`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfms_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
// `c` has 2 lanes, so LANE must be in 0..=1.
static_assert_imm1!(LANE);
vfms_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-subtract to accumulator
///
/// Computes `a - b * c[LANE]` per lane, selecting the multiplier lane from a
/// 128-bit (`q`) vector `c`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfms_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
// `c` has 4 lanes, so LANE must be in 0..=3.
static_assert_imm2!(LANE);
vfms_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-subtract to accumulator
///
/// Computes `a - b * c[LANE]` per lane of the 128-bit accumulator, selecting
/// the multiplier lane from a 64-bit vector `c`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmsq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
// `c` has 2 lanes, so LANE must be in 0..=1.
static_assert_imm1!(LANE);
vfmsq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-subtract to accumulator
///
/// Computes `a - b * c[LANE]` per lane, with accumulator, multiplicand and
/// lane source all 128-bit vectors.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmsq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
// `c` has 4 lanes, so LANE must be in 0..=3.
static_assert_imm2!(LANE);
vfmsq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-subtract to accumulator
///
/// Computes `a - b * c[0]` for single-lane f64 vectors; lowers to `fmsub`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmsub, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfms_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t {
// `c` has a single lane, so only LANE == 0 is valid.
static_assert!(LANE : i32 where LANE == 0);
vfms_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-subtract to accumulator
///
/// Computes `a - b * c[LANE]` for a single-lane f64 accumulator, selecting
/// the multiplier lane from a 2-lane `c`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfms_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t, c: float64x2_t) -> float64x1_t {
// `c` has 2 lanes, so LANE must be in 0..=1.
static_assert_imm1!(LANE);
vfms_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-subtract to accumulator
///
/// Computes `a - b * c[0]` per lane of the 2-lane f64 accumulator; `c` has
/// only one lane to select from.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmsq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x1_t) -> float64x2_t {
// `c` has a single lane, so only LANE == 0 is valid.
static_assert!(LANE : i32 where LANE == 0);
vfmsq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-subtract to accumulator
///
/// Computes `a - b * c[LANE]` per lane for 2-lane f64 vectors.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmsq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
// `c` has 2 lanes, so LANE must be in 0..=1.
static_assert_imm1!(LANE);
vfmsq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32)))
}
/// Floating-point fused multiply-subtract to accumulator
///
/// Scalar form: computes `a - b * c[LANE]` by delegating to the fused
/// multiply-add with `b` negated.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmss_lane_f32<const LANE: i32>(a: f32, b: f32, c: float32x2_t) -> f32 {
// a + (-b) * c[LANE] == a - b * c[LANE]; LANE validity checked by the callee.
vfmas_lane_f32::<LANE>(a, -b, c)
}
/// Floating-point fused multiply-subtract to accumulator
///
/// Scalar form: computes `a - b * c[LANE]` (4-lane `c`) by delegating to the
/// fused multiply-add with `b` negated.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmss_laneq_f32<const LANE: i32>(a: f32, b: f32, c: float32x4_t) -> f32 {
// a + (-b) * c[LANE] == a - b * c[LANE]; LANE validity checked by the callee.
vfmas_laneq_f32::<LANE>(a, -b, c)
}
/// Floating-point fused multiply-subtract to accumulator
///
/// Scalar form: computes `a - b * c[0]` by delegating to the fused
/// multiply-add with `b` negated; lowers to `fmsub`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmsub, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmsd_lane_f64<const LANE: i32>(a: f64, b: f64, c: float64x1_t) -> f64 {
// a + (-b) * c[LANE] == a - b * c[LANE]; LANE validity checked by the callee.
vfmad_lane_f64::<LANE>(a, -b, c)
}
/// Floating-point fused multiply-subtract to accumulator
///
/// Scalar form: computes `a - b * c[LANE]` (2-lane `c`) by delegating to the
/// fused multiply-add with `b` negated.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmls, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vfmsd_laneq_f64<const LANE: i32>(a: f64, b: f64, c: float64x2_t) -> f64 {
// a + (-b) * c[LANE] == a - b * c[LANE]; LANE validity checked by the callee.
vfmad_laneq_f64::<LANE>(a, -b, c)
}
/// Divide
@ -13006,9 +13298,9 @@ mod test {
#[simd_test(enable = "neon")]
unsafe fn test_vfma_f64() {
let a: f64 = 2.0;
let a: f64 = 8.0;
let b: f64 = 6.0;
let c: f64 = 8.0;
let c: f64 = 2.0;
let e: f64 = 20.0;
let r: f64 = transmute(vfma_f64(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
@ -13016,9 +13308,9 @@ mod test {
#[simd_test(enable = "neon")]
unsafe fn test_vfmaq_f64() {
let a: f64x2 = f64x2::new(2.0, 3.0);
let a: f64x2 = f64x2::new(8.0, 18.0);
let b: f64x2 = f64x2::new(6.0, 4.0);
let c: f64x2 = f64x2::new(8.0, 18.0);
let c: f64x2 = f64x2::new(2.0, 3.0);
let e: f64x2 = f64x2::new(20.0, 30.0);
let r: f64x2 = transmute(vfmaq_f64(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
@ -13044,6 +13336,286 @@ mod test {
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a + b * c[0]: e.g. 2 + 6*2 = 14 and 3 + 4*2 = 11.
unsafe fn test_vfma_lane_f32() {
let a: f32x2 = f32x2::new(2., 3.);
let b: f32x2 = f32x2::new(6., 4.);
let c: f32x2 = f32x2::new(2., 0.);
let e: f32x2 = f32x2::new(14., 11.);
let r: f32x2 = transmute(vfma_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a + b * c[0] with a 4-lane lane source: 2 + 6*2 = 14, 3 + 4*2 = 11.
unsafe fn test_vfma_laneq_f32() {
let a: f32x2 = f32x2::new(2., 3.);
let b: f32x2 = f32x2::new(6., 4.);
let c: f32x4 = f32x4::new(2., 0., 0., 0.);
let e: f32x2 = f32x2::new(14., 11.);
let r: f32x2 = transmute(vfma_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a + b * c[0] on 4-lane accumulators: e.g. 2 + 6*2 = 14, 5 + 8*2 = 21.
unsafe fn test_vfmaq_lane_f32() {
let a: f32x4 = f32x4::new(2., 3., 4., 5.);
let b: f32x4 = f32x4::new(6., 4., 7., 8.);
let c: f32x2 = f32x2::new(2., 0.);
let e: f32x4 = f32x4::new(14., 11., 18., 21.);
let r: f32x4 = transmute(vfmaq_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a + b * c[0] on all-128-bit operands: e.g. 2 + 6*2 = 14.
unsafe fn test_vfmaq_laneq_f32() {
let a: f32x4 = f32x4::new(2., 3., 4., 5.);
let b: f32x4 = f32x4::new(6., 4., 7., 8.);
let c: f32x4 = f32x4::new(2., 0., 0., 0.);
let e: f32x4 = f32x4::new(14., 11., 18., 21.);
let r: f32x4 = transmute(vfmaq_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a + b * c[0] on single-lane f64: 2 + 6*2 = 14.
unsafe fn test_vfma_lane_f64() {
let a: f64 = 2.;
let b: f64 = 6.;
let c: f64 = 2.;
let e: f64 = 14.;
let r: f64 = transmute(vfma_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a + b * c[0] with a 2-lane lane source: 2 + 6*2 = 14.
unsafe fn test_vfma_laneq_f64() {
let a: f64 = 2.;
let b: f64 = 6.;
let c: f64x2 = f64x2::new(2., 0.);
let e: f64 = 14.;
let r: f64 = transmute(vfma_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a + b * c[0] on 2-lane f64: 2 + 6*2 = 14, 3 + 4*2 = 11.
unsafe fn test_vfmaq_lane_f64() {
let a: f64x2 = f64x2::new(2., 3.);
let b: f64x2 = f64x2::new(6., 4.);
let c: f64 = 2.;
let e: f64x2 = f64x2::new(14., 11.);
let r: f64x2 = transmute(vfmaq_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a + b * c[0] on 2-lane f64 operands: 2 + 6*2 = 14, 3 + 4*2 = 11.
unsafe fn test_vfmaq_laneq_f64() {
let a: f64x2 = f64x2::new(2., 3.);
let b: f64x2 = f64x2::new(6., 4.);
let c: f64x2 = f64x2::new(2., 0.);
let e: f64x2 = f64x2::new(14., 11.);
let r: f64x2 = transmute(vfmaq_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies the scalar form a + b * c[0]: 2 + 6*3 = 20.
unsafe fn test_vfmas_lane_f32() {
let a: f32 = 2.;
let b: f32 = 6.;
let c: f32x2 = f32x2::new(3., 0.);
let e: f32 = 20.;
let r: f32 = transmute(vfmas_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies the scalar form a + b * c[0] with a 4-lane source: 2 + 6*3 = 20.
unsafe fn test_vfmas_laneq_f32() {
let a: f32 = 2.;
let b: f32 = 6.;
let c: f32x4 = f32x4::new(3., 0., 0., 0.);
let e: f32 = 20.;
let r: f32 = transmute(vfmas_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies the scalar f64 form a + b * c[0]: 2 + 6*3 = 20.
unsafe fn test_vfmad_lane_f64() {
let a: f64 = 2.;
let b: f64 = 6.;
let c: f64 = 3.;
let e: f64 = 20.;
let r: f64 = transmute(vfmad_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies the scalar f64 form a + b * c[0] with a 2-lane source: 2 + 6*3 = 20.
unsafe fn test_vfmad_laneq_f64() {
let a: f64 = 2.;
let b: f64 = 6.;
let c: f64x2 = f64x2::new(3., 0.);
let e: f64 = 20.;
let r: f64 = transmute(vfmad_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c: 20 - 6*2 = 8.
unsafe fn test_vfms_f64() {
let a: f64 = 20.0;
let b: f64 = 6.0;
let c: f64 = 2.0;
let e: f64 = 8.0;
let r: f64 = transmute(vfms_f64(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c per lane: 20 - 6*2 = 8, 30 - 4*3 = 18.
unsafe fn test_vfmsq_f64() {
let a: f64x2 = f64x2::new(20.0, 30.0);
let b: f64x2 = f64x2::new(6.0, 4.0);
let c: f64x2 = f64x2::new(2.0, 3.0);
let e: f64x2 = f64x2::new(8.0, 18.0);
let r: f64x2 = transmute(vfmsq_f64(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c with broadcast scalar c: 50 - 6*8 = 2.
unsafe fn test_vfms_n_f64() {
let a: f64 = 50.0;
let b: f64 = 6.0;
let c: f64 = 8.0;
let e: f64 = 2.0;
let r: f64 = transmute(vfms_n_f64(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c with broadcast scalar c: 50 - 6*8 = 2, 35 - 4*8 = 3.
unsafe fn test_vfmsq_n_f64() {
let a: f64x2 = f64x2::new(50.0, 35.0);
let b: f64x2 = f64x2::new(6.0, 4.0);
let c: f64 = 8.0;
let e: f64x2 = f64x2::new(2.0, 3.0);
let r: f64x2 = transmute(vfmsq_n_f64(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c[0]: 14 - 6*2 = 2, 11 - 4*2 = 3.
unsafe fn test_vfms_lane_f32() {
let a: f32x2 = f32x2::new(14., 11.);
let b: f32x2 = f32x2::new(6., 4.);
let c: f32x2 = f32x2::new(2., 0.);
let e: f32x2 = f32x2::new(2., 3.);
let r: f32x2 = transmute(vfms_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c[0] with a 4-lane lane source: 14 - 6*2 = 2.
unsafe fn test_vfms_laneq_f32() {
let a: f32x2 = f32x2::new(14., 11.);
let b: f32x2 = f32x2::new(6., 4.);
let c: f32x4 = f32x4::new(2., 0., 0., 0.);
let e: f32x2 = f32x2::new(2., 3.);
let r: f32x2 = transmute(vfms_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c[0] on 4-lane accumulators: e.g. 21 - 8*2 = 5.
unsafe fn test_vfmsq_lane_f32() {
let a: f32x4 = f32x4::new(14., 11., 18., 21.);
let b: f32x4 = f32x4::new(6., 4., 7., 8.);
let c: f32x2 = f32x2::new(2., 0.);
let e: f32x4 = f32x4::new(2., 3., 4., 5.);
let r: f32x4 = transmute(vfmsq_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c[0] on all-128-bit operands: e.g. 14 - 6*2 = 2.
unsafe fn test_vfmsq_laneq_f32() {
let a: f32x4 = f32x4::new(14., 11., 18., 21.);
let b: f32x4 = f32x4::new(6., 4., 7., 8.);
let c: f32x4 = f32x4::new(2., 0., 0., 0.);
let e: f32x4 = f32x4::new(2., 3., 4., 5.);
let r: f32x4 = transmute(vfmsq_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c[0] on single-lane f64: 14 - 6*2 = 2.
unsafe fn test_vfms_lane_f64() {
let a: f64 = 14.;
let b: f64 = 6.;
let c: f64 = 2.;
let e: f64 = 2.;
let r: f64 = transmute(vfms_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c[0] with a 2-lane lane source: 14 - 6*2 = 2.
unsafe fn test_vfms_laneq_f64() {
let a: f64 = 14.;
let b: f64 = 6.;
let c: f64x2 = f64x2::new(2., 0.);
let e: f64 = 2.;
let r: f64 = transmute(vfms_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c[0] on 2-lane f64: 14 - 6*2 = 2, 11 - 4*2 = 3.
unsafe fn test_vfmsq_lane_f64() {
let a: f64x2 = f64x2::new(14., 11.);
let b: f64x2 = f64x2::new(6., 4.);
let c: f64 = 2.;
let e: f64x2 = f64x2::new(2., 3.);
let r: f64x2 = transmute(vfmsq_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c[0] on 2-lane f64 operands: 14 - 6*2 = 2, 11 - 4*2 = 3.
unsafe fn test_vfmsq_laneq_f64() {
let a: f64x2 = f64x2::new(14., 11.);
let b: f64x2 = f64x2::new(6., 4.);
let c: f64x2 = f64x2::new(2., 0.);
let e: f64x2 = f64x2::new(2., 3.);
let r: f64x2 = transmute(vfmsq_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies the scalar form a - b * c[0]: 14 - 6*2 = 2.
unsafe fn test_vfmss_lane_f32() {
let a: f32 = 14.;
let b: f32 = 6.;
let c: f32x2 = f32x2::new(2., 0.);
let e: f32 = 2.;
let r: f32 = transmute(vfmss_lane_f32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies the scalar form a - b * c[0] with a 4-lane source: 14 - 6*2 = 2.
unsafe fn test_vfmss_laneq_f32() {
let a: f32 = 14.;
let b: f32 = 6.;
let c: f32x4 = f32x4::new(2., 0., 0., 0.);
let e: f32 = 2.;
let r: f32 = transmute(vfmss_laneq_f32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies the scalar f64 form a - b * c[0]: 14 - 6*2 = 2.
unsafe fn test_vfmsd_lane_f64() {
let a: f64 = 14.;
let b: f64 = 6.;
let c: f64 = 2.;
let e: f64 = 2.;
let r: f64 = transmute(vfmsd_lane_f64::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies the scalar f64 form a - b * c[0] with a 2-lane source: 14 - 6*2 = 2.
unsafe fn test_vfmsd_laneq_f64() {
let a: f64 = 14.;
let b: f64 = 6.;
let c: f64x2 = f64x2::new(2., 0.);
let e: f64 = 2.;
let r: f64 = transmute(vfmsd_laneq_f64::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vdiv_f32() {
let a: f32x2 = f32x2::new(2.0, 6.0);

View file

@ -6607,7 +6607,7 @@ pub unsafe fn vfma_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float3
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v2f32")]
fn vfma_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
}
vfma_f32_(a, b, c)
vfma_f32_(b, c, a)
}
/// Floating-point fused Multiply-Add to accumulator(vector)
@ -6623,7 +6623,7 @@ pub unsafe fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v4f32")]
fn vfmaq_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
}
vfmaq_f32_(a, b, c)
vfmaq_f32_(b, c, a)
}
/// Floating-point fused Multiply-Add to accumulator(vector)
@ -6633,8 +6633,7 @@ vfmaq_f32_(a, b, c)
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
let d: float32x2_t = transmute(f32x2::new(c, c));
vfma_f32(b, d, a)
vfma_f32(a, b, vdup_n_f32(c))
}
/// Floating-point fused Multiply-Add to accumulator(vector)
@ -6644,8 +6643,49 @@ pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
pub unsafe fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
let d: float32x4_t = transmute(f32x4::new(c, c, c, c));
vfmaq_f32(b, d, a)
vfmaq_f32(a, b, vdupq_n_f32(c))
}
/// Floating-point fused multiply-subtract from accumulator
///
/// Computes `a - b * c` per lane by negating `b` and reusing the fused
/// multiply-add (shared arm/aarch64 path).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
pub unsafe fn vfms_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
// a + (-b) * c == a - b * c
let b: float32x2_t = simd_neg(b);
vfma_f32(a, b, c)
}
/// Floating-point fused multiply-subtract from accumulator
///
/// Computes `a - b * c` per lane of the 128-bit vectors by negating `b` and
/// reusing the fused multiply-add (shared arm/aarch64 path).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
pub unsafe fn vfmsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
// a + (-b) * c == a - b * c
let b: float32x4_t = simd_neg(b);
vfmaq_f32(a, b, c)
}
/// Floating-point fused Multiply-subtract to accumulator(vector)
///
/// Computes `a - b * c` per lane, with the scalar `c` broadcast to every lane.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
pub unsafe fn vfms_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
vfms_f32(a, b, vdup_n_f32(c))
}
/// Floating-point fused Multiply-subtract to accumulator(vector)
///
/// Computes `a - b * c` per lane of the 128-bit vectors, with the scalar `c`
/// broadcast to every lane.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
pub unsafe fn vfmsq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
vfmsq_f32(a, b, vdupq_n_f32(c))
}
/// Subtract
@ -19484,9 +19524,9 @@ mod test {
#[simd_test(enable = "neon")]
unsafe fn test_vfma_f32() {
let a: f32x2 = f32x2::new(2.0, 3.0);
let a: f32x2 = f32x2::new(8.0, 18.0);
let b: f32x2 = f32x2::new(6.0, 4.0);
let c: f32x2 = f32x2::new(8.0, 18.0);
let c: f32x2 = f32x2::new(2.0, 3.0);
let e: f32x2 = f32x2::new(20.0, 30.0);
let r: f32x2 = transmute(vfma_f32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
@ -19494,9 +19534,9 @@ mod test {
#[simd_test(enable = "neon")]
unsafe fn test_vfmaq_f32() {
let a: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
let a: f32x4 = f32x4::new(8.0, 18.0, 12.0, 10.0);
let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
let c: f32x4 = f32x4::new(8.0, 18.0, 12.0, 10.0);
let c: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
let e: f32x4 = f32x4::new(20.0, 30.0, 40.0, 50.0);
let r: f32x4 = transmute(vfmaq_f32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
@ -19522,6 +19562,46 @@ mod test {
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c per lane: 20 - 6*2 = 8, 30 - 4*3 = 18.
unsafe fn test_vfms_f32() {
let a: f32x2 = f32x2::new(20.0, 30.0);
let b: f32x2 = f32x2::new(6.0, 4.0);
let c: f32x2 = f32x2::new(2.0, 3.0);
let e: f32x2 = f32x2::new(8.0, 18.0);
let r: f32x2 = transmute(vfms_f32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c per lane: e.g. 20 - 6*2 = 8, 50 - 8*5 = 10.
unsafe fn test_vfmsq_f32() {
let a: f32x4 = f32x4::new(20.0, 30.0, 40.0, 50.0);
let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
let c: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
let e: f32x4 = f32x4::new(8.0, 18.0, 12.0, 10.0);
let r: f32x4 = transmute(vfmsq_f32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c with broadcast scalar c: 50 - 6*8 = 2, 35 - 4*8 = 3.
unsafe fn test_vfms_n_f32() {
let a: f32x2 = f32x2::new(50.0, 35.0);
let b: f32x2 = f32x2::new(6.0, 4.0);
let c: f32 = 8.0;
let e: f32x2 = f32x2::new(2.0, 3.0);
let r: f32x2 = transmute(vfms_n_f32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Verifies a - b * c with broadcast scalar c: e.g. 50 - 6*8 = 2, 69 - 8*8 = 5.
unsafe fn test_vfmsq_n_f32() {
let a: f32x4 = f32x4::new(50.0, 35.0, 60.0, 69.0);
let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
let c: f32 = 8.0;
let e: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
let r: f32x4 = transmute(vfmsq_n_f32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsub_s8() {
let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);

View file

@ -37,7 +37,6 @@
external_doc,
allow_internal_unstable,
decl_macro,
extended_key_value_attributes,
bench_black_box
)]
#![cfg_attr(test, feature(test, abi_vectorcall))]

View file

@ -2402,31 +2402,27 @@ generate f32:float32x2_t:f32, f32:float32x4_t:f32, f64:float64x1_t:f64, f64:floa
/// Floating-point fused Multiply-Add to accumulator(vector)
name = vfma
a = 2.0, 3.0, 4.0, 5.0
multi_fn = vfma-self-_, b, c, a
a = 8.0, 18.0, 12.0, 10.0
b = 6.0, 4.0, 7.0, 8.0
c = 8.0, 18.0, 12.0, 10.0
c = 2.0, 3.0, 4.0, 5.0
validate 20.0, 30.0, 40.0, 50.0
link-aarch64 = llvm.fma._EXT_
aarch64 = fmadd
link-aarch64 = llvm.fma._EXT_
generate float64x1_t
aarch64 = fmla
link-aarch64 = llvm.fma._EXT_
generate float64x2_t
target = fp-armv8
arm = vfma
aarch64 = fmla
link-arm = llvm.fma._EXT_
link-aarch64 = llvm.fma._EXT_
generate float*_t
/// Floating-point fused Multiply-Add to accumulator(vector)
name = vfma
n-suffix
multi_fn = transmute, d:in_t, {f64x1::new, c}
multi_fn = vfma-self-noext, b, transmute(d), a
multi_fn = vfma-self-noext, a, b, {vdup-nself-noext, c}
a = 2.0, 3.0, 4.0, 5.0
b = 6.0, 4.0, 7.0, 8.0
c = 8.0
@ -2434,49 +2430,126 @@ validate 50.0, 35.0, 60.0, 69.0
aarch64 = fmadd
generate float64x1_t:float64x1_t:f64:float64x1_t
/// Floating-point fused Multiply-Add to accumulator(vector)
name = vfma
n-suffix
multi_fn = transmute, d:in_t, {f64x2::new, c, c}
multi_fn = vfma-self-noext, b, d, a
a = 2.0, 3.0, 4.0, 5.0
b = 6.0, 4.0, 7.0, 8.0
c = 8.0
validate 50.0, 35.0, 60.0, 69.0
aarch64 = fmla
generate float64x2_t:float64x2_t:f64:float64x2_t
/// Floating-point fused Multiply-Add to accumulator(vector)
name = vfma
n-suffix
multi_fn = transmute, d:in_t, {f32x2::new, c, c}
multi_fn = vfma-self-noext, b, d, a
a = 2.0, 3.0, 4.0, 5.0
b = 6.0, 4.0, 7.0, 8.0
c = 8.0
validate 50.0, 35.0, 60.0, 69.0
target = fp-armv8
arm = vfma
aarch64 = fmla
generate float32x2_t:float32x2_t:f32:float32x2_t
generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
/// Floating-point fused Multiply-Add to accumulator(vector)
/// Floating-point fused multiply-add to accumulator
name = vfma
n-suffix
multi_fn = transmute, d:in_t, {f32x4::new, c, c, c, c}
multi_fn = vfma-self-noext, b, d, a
a = 2.0, 3.0, 4.0, 5.0
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vfma-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}}
a = 2., 3., 4., 5.
b = 6., 4., 7., 8.
c = 2., 0., 0., 0.
n = 0
validate 14., 11., 18., 21.
aarch64 = fmla
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
aarch64 = fmadd
generate float64x1_t
aarch64 = fmla
generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t
/// Floating-point fused multiply-add to accumulator
name = vfma
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = simd_extract, c:out_t, c, LANE as u32
multi_fn = vfma-in2lane-_, b, c, a
a = 2.
b = 6.
c = 3., 0., 0., 0.
n = 0
validate 20.
aarch64 = fmla
link-aarch64 = llvm.fma._EXT_:f32:f32:f32:f32
generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32
link-aarch64 = llvm.fma._EXT_:f64:f64:f64:f64
aarch64 = fmadd
generate f64:f64:float64x1_t:f64
aarch64 = fmla
generate f64:f64:float64x2_t:f64
/// Floating-point fused multiply-subtract from accumulator
name = vfms
multi_fn = simd_neg, b:in_t, b
multi_fn = vfma-self-noext, a, b, c
a = 20.0, 30.0, 40.0, 50.0
b = 6.0, 4.0, 7.0, 8.0
c = 8.0
validate 50.0, 35.0, 60.0, 69.0
c = 2.0, 3.0, 4.0, 5.0
validate 8.0, 18.0, 12.0, 10.0
aarch64 = fmsub
generate float64x1_t
aarch64 = fmls
generate float64x2_t
target = fp-armv8
arm = vfma
aarch64 = fmla
generate float32x4_t:float32x4_t:f32:float32x4_t
arm = vfms
generate float*_t
/// Floating-point fused Multiply-subtract to accumulator(vector)
name = vfms
n-suffix
multi_fn = vfms-self-noext, a, b, {vdup-nself-noext, c}
a = 50.0, 35.0, 60.0, 69.0
b = 6.0, 4.0, 7.0, 8.0
c = 8.0
validate 2.0, 3.0, 4.0, 5.0
aarch64 = fmsub
generate float64x1_t:float64x1_t:f64:float64x1_t
aarch64 = fmls
generate float64x2_t:float64x2_t:f64:float64x2_t
target = fp-armv8
arm = vfms
generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
/// Floating-point fused multiply-subtract to accumulator
name = vfms
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vfms-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}}
a = 14., 11., 18., 21.
b = 6., 4., 7., 8.
c = 2., 0., 0., 0.
n = 0
validate 2., 3., 4., 5.
aarch64 = fmls
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
aarch64 = fmsub
generate float64x1_t
aarch64 = fmls
generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t
/// Floating-point fused multiply-subtract to accumulator
name = vfms
in2-lane-suffixes
constn = LANE
multi_fn = vfma-in2lane-::<LANE>, a, -b, c
a = 14.
b = 6.
c = 2., 0., 0., 0.
n = 0
validate 2.
aarch64 = fmls
generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32
aarch64 = fmsub
generate f64:f64:float64x1_t:f64
aarch64 = fmls
generate f64:f64:float64x2_t:f64
/// Divide
name = vdiv

View file

@ -122,7 +122,7 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
"cvtpi2ps" => 25,
// core_arch/src/arm_shared/simd32
// vfmaq_n_f32_vfma : #instructions = 26 >= 22 (limit)
"usad8" | "vfma" => 27,
"usad8" | "vfma" | "vfms" => 27,
"qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29,
// Original limit was 20 instructions, but ARM DSP Intrinsics