From fd29f9602cb37b04523bfee65e613a1994d9e879 Mon Sep 17 00:00:00 2001
From: Sparrow Li
Date: Sat, 1 May 2021 04:09:41 +0800
Subject: [PATCH] Add vmul_n, vmul_lane, vmulx neon instructions (#1147)

---
 .../core_arch/src/aarch64/neon/generated.rs   | 770 +++++++++++++++
 .../crates/core_arch/src/aarch64/neon/mod.rs  |  41 -
 .../src/arm_shared/neon/generated.rs          | 904 ++++++++++++++++++
 library/stdarch/crates/stdarch-gen/neon.spec  | 273 +++++-
 .../stdarch/crates/stdarch-gen/src/main.rs    | 393 ++++++--
 5 files changed, 2263 insertions(+), 118 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
index 63fa745c5ae9..88c349e2d586 100644
--- a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
@@ -3934,6 +3934,106 @@ pub unsafe fn vmulq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
     simd_mul(a, b)
 }
 
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul))]
+pub unsafe fn vmul_n_f64(a: float64x1_t, b: f64) -> float64x1_t {
+    simd_mul(a, vdup_n_f64(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul))]
+pub unsafe fn vmulq_n_f64(a: float64x2_t, b: f64) -> float64x2_t {
+    simd_mul(a, vdupq_n_f64(b))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+    static_assert!(LANE : i32 where LANE == 0);
+    simd_mul(a, transmute::<f64, _>(simd_extract(b, LANE as u32)))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x2_t) -> float64x1_t {
+    static_assert_imm1!(LANE);
+    simd_mul(a, transmute::<f64, _>(simd_extract(b, LANE as u32)))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x1_t) -> float64x2_t {
+    static_assert!(LANE : i32 where LANE == 0);
+    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+    static_assert_imm1!(LANE);
+    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmuls_lane_f32<const LANE: i32>(a: f32, b: float32x2_t) -> f32 {
+    static_assert_imm1!(LANE);
+    let b: f32 = simd_extract(b, LANE as u32);
+    a * b
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmuls_laneq_f32<const LANE: i32>(a: f32, b: float32x4_t) -> f32 {
+    static_assert_imm2!(LANE);
+    let b: f32 = simd_extract(b, LANE as u32);
+    a * b
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmuld_lane_f64<const LANE: i32>(a: f64, b: float64x1_t) -> f64 {
+    static_assert!(LANE : i32 where LANE == 0);
+    let b: f64 = simd_extract(b, LANE as u32);
+    a * b
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmuld_laneq_f64<const LANE: i32>(a: f64, b: float64x2_t) -> f64 {
+    static_assert_imm1!(LANE);
+    let b: f64 = simd_extract(b, LANE as u32);
+    a * b
+}
+
 /// Signed multiply long
 #[inline]
 #[target_feature(enable = "neon")]
@@ -4004,6 +4104,316 @@ pub unsafe fn vmull_high_p8(a: poly8x16_t, b: poly8x16_t) -> poly16x8_t {
     vmull_p8(a, b)
 }
 
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2))]
+pub unsafe fn vmull_high_n_s16(a: int16x8_t, b: i16) -> int32x4_t {
+    vmull_high_s16(a, vdupq_n_s16(b))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2))]
+pub unsafe fn vmull_high_n_s32(a: int32x4_t, b: i32) -> int64x2_t {
+    vmull_high_s32(a, vdupq_n_s32(b))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2))]
+pub unsafe fn vmull_high_n_u16(a: uint16x8_t, b: u16) -> uint32x4_t {
+    vmull_high_u16(a, vdupq_n_u16(b))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2))]
+pub unsafe fn vmull_high_n_u32(a: uint32x4_t, b: u32) -> uint64x2_t {
+    vmull_high_u32(a, vdupq_n_u32(b))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_high_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(LANE);
+    vmull_high_s16(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_high_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int32x4_t {
+    static_assert_imm3!(LANE);
+    vmull_high_s16(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_high_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(LANE);
+    vmull_high_s32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_high_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int64x2_t {
+    static_assert_imm2!(LANE);
+    vmull_high_s32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_high_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> uint32x4_t {
+    static_assert_imm2!(LANE);
+    vmull_high_u16(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_high_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
+    static_assert_imm3!(LANE);
+    vmull_high_u16(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_high_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> uint64x2_t {
+    static_assert_imm1!(LANE);
+    vmull_high_u32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umull2, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_high_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
+    static_assert_imm2!(LANE);
+    vmull_high_u32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx))]
+pub unsafe fn vmulx_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.v2f32")]
+        fn vmulx_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+    }
+    vmulx_f32_(a, b)
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx))]
+pub unsafe fn vmulxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.v4f32")]
+        fn vmulxq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+    }
+    vmulxq_f32_(a, b)
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx))]
+pub unsafe fn vmulx_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.v1f64")]
+        fn vmulx_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
+    }
+    vmulx_f64_(a, b)
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx))]
+pub unsafe fn vmulxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.v2f64")]
+        fn vmulxq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+    }
+    vmulxq_f64_(a, b)
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulx_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t) -> float64x1_t {
+    static_assert!(LANE : i32 where LANE == 0);
+    vmulx_f64(a, transmute::<f64, _>(simd_extract(b, LANE as u32)))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulx_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x2_t) -> float64x1_t {
+    static_assert_imm1!(LANE);
+    vmulx_f64(a, transmute::<f64, _>(simd_extract(b, LANE as u32)))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulx_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+    static_assert_imm1!(LANE);
+    vmulx_f32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulx_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x4_t) -> float32x2_t {
+    static_assert_imm2!(LANE);
+    vmulx_f32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulxq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x2_t) -> float32x4_t {
+    static_assert_imm1!(LANE);
+    vmulxq_f32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulxq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+    static_assert_imm2!(LANE);
+    vmulxq_f32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulxq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x1_t) -> float64x2_t {
+    static_assert!(LANE : i32 where LANE == 0);
+    vmulxq_f64(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulxq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t) -> float64x2_t {
+    static_assert_imm1!(LANE);
+    vmulxq_f64(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx))]
+pub unsafe fn vmulxs_f32(a: f32, b: f32) -> f32 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.f32")]
+        fn vmulxs_f32_(a: f32, b: f32) -> f32;
+    }
+    vmulxs_f32_(a, b)
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx))]
+pub unsafe fn vmulxd_f64(a: f64, b: f64) -> f64 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.f64")]
+        fn vmulxd_f64_(a: f64, b: f64) -> f64;
+    }
+    vmulxd_f64_(a, b)
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulxs_lane_f32<const LANE: i32>(a: f32, b: float32x2_t) -> f32 {
+    static_assert_imm1!(LANE);
+    vmulxs_f32(a, simd_extract(b, LANE as u32))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulxs_laneq_f32<const LANE: i32>(a: f32, b: float32x4_t) -> f32 {
+    static_assert_imm2!(LANE);
+    vmulxs_f32(a, simd_extract(b, LANE as u32))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulxd_lane_f64<const LANE: i32>(a: f64, b: float64x1_t) -> f64 {
+    static_assert!(LANE : i32 where LANE == 0);
+    vmulxd_f64(a, simd_extract(b, LANE as u32))
+}
+
+/// Floating-point multiply extended
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulxd_laneq_f64<const LANE: i32>(a: f64, b: float64x2_t) -> f64 {
+    static_assert_imm1!(LANE);
+    vmulxd_f64(a, simd_extract(b, LANE as u32))
+}
+
 /// Floating-point fused Multiply-Add to accumulator(vector)
 #[inline]
 #[target_feature(enable = "neon")]
@@ -10814,6 +11224,96 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmul_n_f64() {
+        let a: f64 = 1.;
+        let b: f64 = 2.;
+        let e: f64 = 2.;
+        let r: f64 = transmute(vmul_n_f64(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmulq_n_f64() {
+        let a: f64x2 = f64x2::new(1., 2.);
+        let b: f64 = 2.;
+        let e: f64x2 = f64x2::new(2., 4.);
+        let r: f64x2 = transmute(vmulq_n_f64(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmul_lane_f64() {
+        let a: f64 = 1.;
+        let b: f64 = 2.;
+        let e: f64 = 2.;
+        let r: f64 = transmute(vmul_lane_f64::<0>(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmul_laneq_f64() {
+        let a: f64 = 1.;
+        let b: f64x2 = f64x2::new(2., 0.);
+        let e: f64 = 2.;
+        let r: f64 = transmute(vmul_laneq_f64::<0>(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmulq_lane_f64() {
+        let a: f64x2 = f64x2::new(1., 2.);
+        let b: f64 = 2.;
+        let e: f64x2 = f64x2::new(2., 4.);
+        let r: f64x2 = transmute(vmulq_lane_f64::<0>(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmulq_laneq_f64() {
+        let a: f64x2 = f64x2::new(1., 2.);
+        let b: f64x2 = f64x2::new(2., 0.);
+        let e: f64x2 = f64x2::new(2., 4.);
+        let r: f64x2 = transmute(vmulq_laneq_f64::<0>(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmuls_lane_f32() {
+        let a: f32 = 1.;
+        let b: f32x2 = f32x2::new(2., 0.);
+        let e: f32 = 2.;
+        let r: f32 = transmute(vmuls_lane_f32::<0>(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmuls_laneq_f32() {
+        let a: f32 = 1.;
+        let b: f32x4 = f32x4::new(2., 0., 0., 0.);
+        let e: f32 = 2.;
+        let r: f32 = transmute(vmuls_laneq_f32::<0>(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmuld_lane_f64() {
+        let a: f64 = 1.;
+        let b: f64 = 2.;
+        let e: f64 = 2.;
+        let r: f64 = transmute(vmuld_lane_f64::<0>(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmuld_laneq_f64() {
+        let a: f64 = 1.;
+        let b: f64x2 = f64x2::new(2., 0.);
+        let e: f64 = 2.;
+        let r: f64 = transmute(vmuld_laneq_f64::<0>(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vmull_high_s8() {
         let a: i8x16 = i8x16::new(1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16);
@@ -10877,6 +11377,276 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmull_high_n_s16() {
+        let a: i16x8 = i16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
+        let b: 
i16 = 2; + let e: i32x4 = i32x4::new(18, 20, 22, 24); + let r: i32x4 = transmute(vmull_high_n_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_n_s32() { + let a: i32x4 = i32x4::new(1, 2, 9, 10); + let b: i32 = 2; + let e: i64x2 = i64x2::new(18, 20); + let r: i64x2 = transmute(vmull_high_n_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_n_u16() { + let a: u16x8 = u16x8::new(1, 2, 9, 10, 9, 10, 11, 12); + let b: u16 = 2; + let e: u32x4 = u32x4::new(18, 20, 22, 24); + let r: u32x4 = transmute(vmull_high_n_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_n_u32() { + let a: u32x4 = u32x4::new(1, 2, 9, 10); + let b: u32 = 2; + let e: u64x2 = u64x2::new(18, 20); + let r: u64x2 = transmute(vmull_high_n_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_lane_s16() { + let a: i16x8 = i16x8::new(1, 2, 9, 10, 9, 10, 11, 12); + let b: i16x4 = i16x4::new(0, 2, 0, 0); + let e: i32x4 = i32x4::new(18, 20, 22, 24); + let r: i32x4 = transmute(vmull_high_lane_s16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_laneq_s16() { + let a: i16x8 = i16x8::new(1, 2, 9, 10, 9, 10, 11, 12); + let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0); + let e: i32x4 = i32x4::new(18, 20, 22, 24); + let r: i32x4 = transmute(vmull_high_laneq_s16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_lane_s32() { + let a: i32x4 = i32x4::new(1, 2, 9, 10); + let b: i32x2 = i32x2::new(0, 2); + let e: i64x2 = i64x2::new(18, 20); + let r: i64x2 = transmute(vmull_high_lane_s32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_laneq_s32() { + let a: i32x4 = i32x4::new(1, 2, 9, 10); + let b: i32x4 = i32x4::new(0, 2, 0, 0); + let e: i64x2 = i64x2::new(18, 20); + let r: i64x2 = transmute(vmull_high_laneq_s32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_lane_u16() { + let a: u16x8 = u16x8::new(1, 2, 9, 10, 9, 10, 11, 12); + let b: u16x4 = u16x4::new(0, 2, 0, 0); + let e: u32x4 = u32x4::new(18, 20, 22, 24); + let r: u32x4 = transmute(vmull_high_lane_u16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_laneq_u16() { + let a: u16x8 = u16x8::new(1, 2, 9, 10, 9, 10, 11, 12); + let b: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0); + let e: u32x4 = u32x4::new(18, 20, 22, 24); + let r: u32x4 = transmute(vmull_high_laneq_u16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_lane_u32() { + let a: u32x4 = u32x4::new(1, 2, 9, 10); + let b: u32x2 = u32x2::new(0, 2); + let e: u64x2 = u64x2::new(18, 20); + let r: u64x2 = transmute(vmull_high_lane_u32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_laneq_u32() { + let a: u32x4 = u32x4::new(1, 2, 9, 10); + let b: u32x4 = u32x4::new(0, 2, 0, 0); + let e: u64x2 = u64x2::new(18, 20); + let r: u64x2 = transmute(vmull_high_laneq_u32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn 
test_vmulx_f32() { + let a: f32x2 = f32x2::new(1., 2.); + let b: f32x2 = f32x2::new(2., 2.); + let e: f32x2 = f32x2::new(2., 4.); + let r: f32x2 = transmute(vmulx_f32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulxq_f32() { + let a: f32x4 = f32x4::new(1., 2., 3., 4.); + let b: f32x4 = f32x4::new(2., 2., 2., 2.); + let e: f32x4 = f32x4::new(2., 4., 6., 8.); + let r: f32x4 = transmute(vmulxq_f32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulx_f64() { + let a: f64 = 1.; + let b: f64 = 2.; + let e: f64 = 2.; + let r: f64 = transmute(vmulx_f64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulxq_f64() { + let a: f64x2 = f64x2::new(1., 2.); + let b: f64x2 = f64x2::new(2., 2.); + let e: f64x2 = f64x2::new(2., 4.); + let r: f64x2 = transmute(vmulxq_f64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulx_lane_f64() { + let a: f64 = 1.; + let b: f64 = 2.; + let e: f64 = 2.; + let r: f64 = transmute(vmulx_lane_f64::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulx_laneq_f64() { + let a: f64 = 1.; + let b: f64x2 = f64x2::new(2., 0.); + let e: f64 = 2.; + let r: f64 = transmute(vmulx_laneq_f64::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulx_lane_f32() { + let a: f32x2 = f32x2::new(1., 2.); + let b: f32x2 = f32x2::new(2., 0.); + let e: f32x2 = f32x2::new(2., 4.); + let r: f32x2 = transmute(vmulx_lane_f32::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulx_laneq_f32() { + let a: f32x2 = f32x2::new(1., 2.); + let b: f32x4 = f32x4::new(2., 0., 0., 0.); + let e: f32x2 = f32x2::new(2., 4.); + let r: f32x2 = transmute(vmulx_laneq_f32::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulxq_lane_f32() { + let a: f32x4 = f32x4::new(1., 2., 3., 4.); + let b: f32x2 = f32x2::new(2., 0.); + let e: f32x4 = f32x4::new(2., 4., 6., 8.); + let r: f32x4 = transmute(vmulxq_lane_f32::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulxq_laneq_f32() { + let a: f32x4 = f32x4::new(1., 2., 3., 4.); + let b: f32x4 = f32x4::new(2., 0., 0., 0.); + let e: f32x4 = f32x4::new(2., 4., 6., 8.); + let r: f32x4 = transmute(vmulxq_laneq_f32::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulxq_lane_f64() { + let a: f64x2 = f64x2::new(1., 2.); + let b: f64 = 2.; + let e: f64x2 = f64x2::new(2., 4.); + let r: f64x2 = transmute(vmulxq_lane_f64::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulxq_laneq_f64() { + let a: f64x2 = f64x2::new(1., 2.); + let b: f64x2 = f64x2::new(2., 0.); + let e: f64x2 = f64x2::new(2., 4.); + let r: f64x2 = transmute(vmulxq_laneq_f64::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulxs_f32() { + let a: f32 = 2.; + let b: f32 = 3.; + let e: f32 = 6.; + let r: f32 = transmute(vmulxs_f32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulxd_f64() { + let a: f64 = 2.; + let b: f64 = 3.; + let e: f64 = 6.; + let r: f64 = 
transmute(vmulxd_f64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulxs_lane_f32() { + let a: f32 = 2.; + let b: f32x2 = f32x2::new(3., 0.); + let e: f32 = 6.; + let r: f32 = transmute(vmulxs_lane_f32::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulxs_laneq_f32() { + let a: f32 = 2.; + let b: f32x4 = f32x4::new(3., 0., 0., 0.); + let e: f32 = 6.; + let r: f32 = transmute(vmulxs_laneq_f32::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulxd_lane_f64() { + let a: f64 = 2.; + let b: f64 = 3.; + let e: f64 = 6.; + let r: f64 = transmute(vmulxd_lane_f64::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulxd_laneq_f64() { + let a: f64 = 2.; + let b: f64x2 = f64x2::new(3., 0.); + let e: f64 = 6.; + let r: f64 = transmute(vmulxd_laneq_f64::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vfma_f64() { let a: f64 = 2.0; diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs index 77a982f3268d..2af220dace30 100644 --- a/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs +++ b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs @@ -108,9 +108,6 @@ extern "C" { #[link_name = "llvm.aarch64.neon.usqadd.v2i64"] fn vsqaddq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; - #[link_name = "llvm.aarch64.neon.pmull64"] - fn vmull_p64_(a: i64, b: i64) -> int8x16_t; - #[link_name = "llvm.aarch64.neon.addp.v8i16"] fn vpaddq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; #[link_name = "llvm.aarch64.neon.addp.v4i32"] @@ -1150,14 +1147,6 @@ pub unsafe fn vaddlvq_u8(a: uint8x16_t) -> u16 { vaddlvq_u8_(a) as u16 } -/// Polynomial multiply long -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(pmull))] -pub unsafe fn vmull_p64(a: p64, b: p64) -> p128 { - transmute(vmull_p64_(transmute(a), transmute(b))) -} - /// Vector add. 
#[inline] #[target_feature(enable = "neon")] @@ -3260,36 +3249,6 @@ mod tests { assert_eq!(r, e); } - #[simd_test(enable = "neon")] - unsafe fn test_vmull_p64() { - // FIXME: I've a hard time writing a test for this as the documentation - // from arm is a bit thin as to waht exactly it does - let a: i64 = 8; - let b: i64 = 7; - let e: i128 = 56; - let r: i128 = transmute(vmull_p64(transmute(a), transmute(b))); - assert_eq!(r, e); - - /* - let a: i64 = 5; - let b: i64 = 5; - let e: i128 = 25; - let r: i128 = transmute(vmull_p64(a, b)); - - assert_eq!(r, e); - let a: i64 = 6; - let b: i64 = 6; - let e: i128 = 36; - let r: i128 = transmute(vmull_p64(a, b)); - assert_eq!(r, e); - - let a: i64 = 7; - let b: i64 = 6; - let e: i128 = 42; - let r: i128 = transmute(vmull_p64(a, b)); - assert_eq!(r, e); - */ - } #[simd_test(enable = "neon")] unsafe fn test_vadd_f64() { let a = 1.; diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs index 7f7c8ffddc59..599325801232 100644 --- a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs @@ -5558,6 +5558,38 @@ pub unsafe fn vmulq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { simd_mul(a, b) } +/// Polynomial multiply +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmul))] +pub unsafe fn vmul_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulp.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmul.v8i8")] + fn vmul_p8_(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t; + } +vmul_p8_(a, b) +} + +/// Polynomial multiply +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmul))] +pub unsafe fn vmulq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulp.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmul.v16i8")] + fn vmulq_p8_(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t; + } +vmulq_p8_(a, b) +} + /// Multiply #[inline] #[target_feature(enable = "neon")] @@ -5578,6 +5610,346 @@ pub unsafe fn vmulq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { simd_mul(a, b) } +/// Vector multiply by scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] +pub unsafe fn vmul_n_s16(a: int16x4_t, b: i16) -> int16x4_t { + simd_mul(a, vdup_n_s16(b)) +} + +/// Vector multiply by scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] +pub unsafe fn vmulq_n_s16(a: int16x8_t, b: i16) -> int16x8_t { + simd_mul(a, vdupq_n_s16(b)) +} + +/// Vector multiply by scalar +#[inline] 
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_n_s32(a: int32x2_t, b: i32) -> int32x2_t {
+    simd_mul(a, vdup_n_s32(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_n_s32(a: int32x4_t, b: i32) -> int32x4_t {
+    simd_mul(a, vdupq_n_s32(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_n_u16(a: uint16x4_t, b: u16) -> uint16x4_t {
+    simd_mul(a, vdup_n_u16(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_n_u16(a: uint16x8_t, b: u16) -> uint16x8_t {
+    simd_mul(a, vdupq_n_u16(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_n_u32(a: uint32x2_t, b: u32) -> uint32x2_t {
+    simd_mul(a, vdup_n_u32(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_n_u32(a: uint32x4_t, b: u32) -> uint32x4_t {
+    simd_mul(a, vdupq_n_u32(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+pub unsafe fn vmul_n_f32(a: float32x2_t, b: f32) -> float32x2_t {
+    simd_mul(a, vdup_n_f32(b))
+}
+
+/// Vector multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+pub unsafe fn vmulq_n_f32(a: float32x4_t, b: f32) -> float32x4_t {
+    simd_mul(a, vdupq_n_f32(b))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int16x4_t {
+    static_assert_imm3!(LANE);
+    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int16x8_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    static_assert_imm3!(LANE);
+    simd_mul(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    static_assert_imm1!(LANE);
+    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int32x2_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int32x4_t {
+    static_assert_imm1!(LANE);
+    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x8_t) -> uint16x4_t {
+    static_assert_imm3!(LANE);
+    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> uint16x8_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    static_assert_imm3!(LANE);
+    simd_mul(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    static_assert_imm1!(LANE);
+    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x4_t) -> uint32x2_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> uint32x4_t {
+    static_assert_imm1!(LANE);
+    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+    static_assert_imm1!(LANE);
+    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmul_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x4_t) -> float32x2_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x2_t) -> float32x4_t {
+    static_assert_imm1!(LANE);
+    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Floating-point multiply
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmulq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+    static_assert_imm2!(LANE);
+    simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
 /// Signed multiply long
 #[inline]
 #[target_feature(enable = "neon")]
@@ -5690,6 +6062,142 @@ pub unsafe fn vmull_p8(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t {
     vmull_p8_(a, b)
 }
 
+/// Vector long multiply with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
+pub unsafe fn vmull_n_s16(a: int16x4_t, b: i16) -> int32x4_t {
+    vmull_s16(a, vdup_n_s16(b))
+}
+
+/// Vector long multiply with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
+pub unsafe fn vmull_n_s32(a: int32x2_t, b: i32) -> int64x2_t {
+    vmull_s32(a, vdup_n_s32(b))
+}
+
+/// Vector long multiply with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
+pub unsafe fn vmull_n_u16(a: uint16x4_t, b: u16) -> uint32x4_t {
+    vmull_u16(a, vdup_n_u16(b))
+}
+
+/// Vector long multiply with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
+pub unsafe fn vmull_n_u32(a: uint32x2_t, b: u32) -> uint64x2_t {
+    vmull_u32(a, vdup_n_u32(b))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(LANE);
+    vmull_s16(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int32x4_t {
+    static_assert_imm3!(LANE);
+    vmull_s16(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(LANE);
+    vmull_s32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int64x2_t {
+    static_assert_imm2!(LANE);
+    vmull_s32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t {
+    static_assert_imm2!(LANE);
+    vmull_u16(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x8_t) -> uint32x4_t {
+    static_assert_imm3!(LANE);
+    vmull_u16(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
+    static_assert_imm1!(LANE);
+    vmull_u32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
+/// Vector long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vmull_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x4_t) -> uint64x2_t {
+    static_assert_imm2!(LANE);
+    vmull_u32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
+}
+
 /// Floating-point fused Multiply-Add to accumulator(vector)
 #[inline]
 #[target_feature(enable = "neon")]
@@ -17013,6 +17521,24 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmul_p8() {
+        let a: i8x8 = i8x8::new(1, 3, 1, 3, 1, 3, 1, 3);
+        let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let e: i8x8 = i8x8::new(1, 6, 3, 12, 5, 10, 7, 24);
+        let r: i8x8 = transmute(vmul_p8(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmulq_p8() {
+        let a: i8x16 = i8x16::new(1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3);
+        let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+        let e: i8x16 = i8x16::new(1, 6, 3, 12, 5, 10, 7, 24, 9, 30, 11, 20, 13, 18, 15, 48);
+        let r: i8x16 = transmute(vmulq_p8(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vmul_f32() {
         let a: f32x2 = f32x2::new(1.0, 2.0);
@@ -17031,6 +17557,276 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmul_n_s16() {
+        let a: i16x4 = i16x4::new(1, 2, 3, 4);
+        let b: i16 = 2;
+        let e: i16x4 = i16x4::new(2, 4, 6, 8);
+        let r: i16x4 = transmute(vmul_n_s16(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmulq_n_s16() {
+        let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let b: i16 = 2;
+        let e: i16x8 = i16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
+        let r: i16x8 = transmute(vmulq_n_s16(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmul_n_s32() {
+        let a: i32x2 = i32x2::new(1, 2);
+        let b: i32 = 2;
+        let e: i32x2 = i32x2::new(2, 4);
+        let r: i32x2 = transmute(vmul_n_s32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmulq_n_s32() {
+        let a: i32x4 = i32x4::new(1, 2, 3, 4);
+        let b: i32 = 2;
+        let e: i32x4 = i32x4::new(2, 4, 6, 8);
+        let r: i32x4 = 
transmute(vmulq_n_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_n_u16() { + let a: u16x4 = u16x4::new(1, 2, 3, 4); + let b: u16 = 2; + let e: u16x4 = u16x4::new(2, 4, 6, 8); + let r: u16x4 = transmute(vmul_n_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_n_u16() { + let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: u16 = 2; + let e: u16x8 = u16x8::new(2, 4, 6, 8, 10, 12, 14, 16); + let r: u16x8 = transmute(vmulq_n_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_n_u32() { + let a: u32x2 = u32x2::new(1, 2); + let b: u32 = 2; + let e: u32x2 = u32x2::new(2, 4); + let r: u32x2 = transmute(vmul_n_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_n_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32 = 2; + let e: u32x4 = u32x4::new(2, 4, 6, 8); + let r: u32x4 = transmute(vmulq_n_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_n_f32() { + let a: f32x2 = f32x2::new(1., 2.); + let b: f32 = 2.; + let e: f32x2 = f32x2::new(2., 4.); + let r: f32x2 = transmute(vmul_n_f32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_n_f32() { + let a: f32x4 = f32x4::new(1., 2., 3., 4.); + let b: f32 = 2.; + let e: f32x4 = f32x4::new(2., 4., 6., 8.); + let r: f32x4 = transmute(vmulq_n_f32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_lane_s16() { + let a: i16x4 = i16x4::new(1, 2, 3, 4); + let b: i16x4 = i16x4::new(0, 2, 0, 0); + let e: i16x4 = i16x4::new(2, 4, 6, 8); + let r: i16x4 = transmute(vmul_lane_s16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_laneq_s16() { + let a: i16x4 = i16x4::new(1, 2, 3, 4); + let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0); + let e: i16x4 = i16x4::new(2, 4, 6, 8); + let r: i16x4 = transmute(vmul_laneq_s16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_lane_s16() { + let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: i16x4 = i16x4::new(0, 2, 0, 0); + let e: i16x8 = i16x8::new(2, 4, 6, 8, 10, 12, 14, 16); + let r: i16x8 = transmute(vmulq_lane_s16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_laneq_s16() { + let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0); + let e: i16x8 = i16x8::new(2, 4, 6, 8, 10, 12, 14, 16); + let r: i16x8 = transmute(vmulq_laneq_s16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_lane_s32() { + let a: i32x2 = i32x2::new(1, 2); + let b: i32x2 = i32x2::new(0, 2); + let e: i32x2 = i32x2::new(2, 4); + let r: i32x2 = transmute(vmul_lane_s32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_laneq_s32() { + let a: i32x2 = i32x2::new(1, 2); + let b: i32x4 = i32x4::new(0, 2, 0, 0); + let e: i32x2 = i32x2::new(2, 4); + let r: i32x2 = transmute(vmul_laneq_s32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_lane_s32() { + let a: i32x4 = 
i32x4::new(1, 2, 3, 4); + let b: i32x2 = i32x2::new(0, 2); + let e: i32x4 = i32x4::new(2, 4, 6, 8); + let r: i32x4 = transmute(vmulq_lane_s32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_laneq_s32() { + let a: i32x4 = i32x4::new(1, 2, 3, 4); + let b: i32x4 = i32x4::new(0, 2, 0, 0); + let e: i32x4 = i32x4::new(2, 4, 6, 8); + let r: i32x4 = transmute(vmulq_laneq_s32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_lane_u16() { + let a: u16x4 = u16x4::new(1, 2, 3, 4); + let b: u16x4 = u16x4::new(0, 2, 0, 0); + let e: u16x4 = u16x4::new(2, 4, 6, 8); + let r: u16x4 = transmute(vmul_lane_u16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_laneq_u16() { + let a: u16x4 = u16x4::new(1, 2, 3, 4); + let b: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0); + let e: u16x4 = u16x4::new(2, 4, 6, 8); + let r: u16x4 = transmute(vmul_laneq_u16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_lane_u16() { + let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: u16x4 = u16x4::new(0, 2, 0, 0); + let e: u16x8 = u16x8::new(2, 4, 6, 8, 10, 12, 14, 16); + let r: u16x8 = transmute(vmulq_lane_u16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_laneq_u16() { + let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0); + let e: u16x8 = u16x8::new(2, 4, 6, 8, 10, 12, 14, 16); + let r: u16x8 = transmute(vmulq_laneq_u16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_lane_u32() { + let a: u32x2 = u32x2::new(1, 2); + let b: u32x2 = u32x2::new(0, 2); + let e: u32x2 = u32x2::new(2, 4); + let r: u32x2 = transmute(vmul_lane_u32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_laneq_u32() { + let a: u32x2 = u32x2::new(1, 2); + let b: u32x4 = u32x4::new(0, 2, 0, 0); + let e: u32x2 = u32x2::new(2, 4); + let r: u32x2 = transmute(vmul_laneq_u32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_lane_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x2 = u32x2::new(0, 2); + let e: u32x4 = u32x4::new(2, 4, 6, 8); + let r: u32x4 = transmute(vmulq_lane_u32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_laneq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(0, 2, 0, 0); + let e: u32x4 = u32x4::new(2, 4, 6, 8); + let r: u32x4 = transmute(vmulq_laneq_u32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_lane_f32() { + let a: f32x2 = f32x2::new(1., 2.); + let b: f32x2 = f32x2::new(2., 0.); + let e: f32x2 = f32x2::new(2., 4.); + let r: f32x2 = transmute(vmul_lane_f32::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmul_laneq_f32() { + let a: f32x2 = f32x2::new(1., 2.); + let b: f32x4 = f32x4::new(2., 0., 0., 0.); + let e: f32x2 = f32x2::new(2., 4.); + let r: f32x2 = transmute(vmul_laneq_f32::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_lane_f32() { + let a: f32x4 = 
f32x4::new(1., 2., 3., 4.); + let b: f32x2 = f32x2::new(2., 0.); + let e: f32x4 = f32x4::new(2., 4., 6., 8.); + let r: f32x4 = transmute(vmulq_lane_f32::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulq_laneq_f32() { + let a: f32x4 = f32x4::new(1., 2., 3., 4.); + let b: f32x4 = f32x4::new(2., 0., 0., 0.); + let e: f32x4 = f32x4::new(2., 4., 6., 8.); + let r: f32x4 = transmute(vmulq_laneq_f32::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmull_s8() { let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); @@ -17094,6 +17890,114 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vmullh_n_s16() { + let a: i16x4 = i16x4::new(1, 2, 3, 4); + let b: i16 = 2; + let e: i32x4 = i32x4::new(2, 4, 6, 8); + let r: i32x4 = transmute(vmullh_n_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulls_n_s32() { + let a: i32x2 = i32x2::new(1, 2); + let b: i32 = 2; + let e: i64x2 = i64x2::new(2, 4); + let r: i64x2 = transmute(vmulls_n_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmullh_n_u16() { + let a: u16x4 = u16x4::new(1, 2, 3, 4); + let b: u16 = 2; + let e: u32x4 = u32x4::new(2, 4, 6, 8); + let r: u32x4 = transmute(vmullh_n_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmulls_n_u32() { + let a: u32x2 = u32x2::new(1, 2); + let b: u32 = 2; + let e: u64x2 = u64x2::new(2, 4); + let r: u64x2 = transmute(vmulls_n_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_lane_s16() { + let a: i16x4 = i16x4::new(1, 2, 3, 4); + let b: i16x4 = i16x4::new(0, 2, 0, 0); + let e: i32x4 = i32x4::new(2, 4, 6, 8); + let r: i32x4 = transmute(vmull_lane_s16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_laneq_s16() { + let a: i16x4 = i16x4::new(1, 2, 3, 4); + let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0); + let e: i32x4 = i32x4::new(2, 4, 6, 8); + let r: i32x4 = transmute(vmull_laneq_s16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_lane_s32() { + let a: i32x2 = i32x2::new(1, 2); + let b: i32x2 = i32x2::new(0, 2); + let e: i64x2 = i64x2::new(2, 4); + let r: i64x2 = transmute(vmull_lane_s32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_laneq_s32() { + let a: i32x2 = i32x2::new(1, 2); + let b: i32x4 = i32x4::new(0, 2, 0, 0); + let e: i64x2 = i64x2::new(2, 4); + let r: i64x2 = transmute(vmull_laneq_s32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_lane_u16() { + let a: u16x4 = u16x4::new(1, 2, 3, 4); + let b: u16x4 = u16x4::new(0, 2, 0, 0); + let e: u32x4 = u32x4::new(2, 4, 6, 8); + let r: u32x4 = transmute(vmull_lane_u16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_laneq_u16() { + let a: u16x4 = u16x4::new(1, 2, 3, 4); + let b: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0); + let e: u32x4 = u32x4::new(2, 4, 6, 8); + let r: u32x4 = transmute(vmull_laneq_u16::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_lane_u32() { + let a: 
u32x2 = u32x2::new(1, 2); + let b: u32x2 = u32x2::new(0, 2); + let e: u64x2 = u64x2::new(2, 4); + let r: u64x2 = transmute(vmull_lane_u32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_laneq_u32() { + let a: u32x2 = u32x2::new(1, 2); + let b: u32x4 = u32x4::new(0, 2, 0, 0); + let e: u64x2 = u64x2::new(2, 4); + let r: u64x2 = transmute(vmull_laneq_u32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vfma_f32() { let a: f32x2 = f32x2::new(2.0, 3.0); diff --git a/library/stdarch/crates/stdarch-gen/neon.spec b/library/stdarch/crates/stdarch-gen/neon.spec index f0b7448cc30a..6dfe8b5ea8bb 100644 --- a/library/stdarch/crates/stdarch-gen/neon.spec +++ b/library/stdarch/crates/stdarch-gen/neon.spec @@ -338,7 +338,7 @@ generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, /// Signed compare bitwise Test bits nonzero name = vtst -multi_fn = simd_and, c:in_t +multi_fn = simd_and, c:in_t, a, b multi_fn = fixed, d:in_t multi_fn = simd_ne, c, transmute(d) a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX @@ -354,7 +354,7 @@ generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8 /// Unsigned compare bitwise Test bits nonzero name = vtst -multi_fn = simd_and, c:in_t +multi_fn = simd_and, c:in_t, a, b multi_fn = fixed, d:in_t multi_fn = simd_ne, c, transmute(d) a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX @@ -1864,6 +1864,18 @@ aarch64 = mul fn = simd_mul generate int*_t, uint*_t +/// Polynomial multiply +name = vmul +a = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3 +b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +validate 1, 6, 3, 12, 5, 10, 7, 24, 9, 30, 11, 20, 13, 18, 15, 48 + +aarch64 = pmul +link-aarch64 = pmul._EXT_ +arm = vmul +link-arm = vmulp._EXT_ +generate poly8x8_t, poly8x16_t + /// Multiply name = vmul fn = simd_mul @@ -1877,6 +1889,108 @@ generate float64x*_t arm = vmul. generate float*_t +/// Vector multiply by scalar +name = vmul +out-n-suffix +multi_fn = simd_mul, a, {vdup-nout-noext, b} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 2 +validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 + +arm = vmul +aarch64 = mul +generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t +generate uint16x4_t:u16:uint16x4_t, uint16x8_t:u16:uint16x8_t, uint32x2_t:u32:uint32x2_t, uint32x4_t:u32:uint32x4_t + +/// Vector multiply by scalar +name = vmul +out-n-suffix +multi_fn = simd_mul, a, {vdup-nout-noext, b} +a = 1., 2., 3., 4. +b = 2. +validate 2., 4., 6., 8. 
+ +aarch64 = fmul +generate float64x1_t:f64:float64x1_t, float64x2_t:f64:float64x2_t + +arm = vmul +generate float32x2_t:f32:float32x2_t, float32x4_t:f32:float32x4_t + +/// Multiply +name = vmul +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_mul, a, {simd_shuffle-out_len-noext, b, b, {dup-out_len-LANE as u32}} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 1 +validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 + +aarch64 = mul +arm = vmul +generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t +generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t +generate uint16x4_t, uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t +generate uint32x2_t, uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t + +/// Floating-point multiply +name = vmul +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_mul, a, {transmute--, {simd_extract, b, LANE as u32}} +a = 1., 2., 3., 4. +b = 2., 0., 0., 0. +n = 0 +validate 2., 4., 6., 8. + +aarch64 = fmul +generate float64x1_t, float64x1_t:float64x2_t:float64x1_t + +/// Floating-point multiply +name = vmul +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_mul, a, {simd_shuffle-out_len-noext, b, b, {dup-out_len-LANE as u32}} +a = 1., 2., 3., 4. +b = 2., 0., 0., 0. +n = 0 +validate 2., 4., 6., 8. + +aarch64 = fmul +generate float64x2_t:float64x1_t:float64x2_t, float64x2_t + +arm = vmul +generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t + +/// Floating-point multiply +name = vmuls_lane +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_extract, b:f32, b, LANE as u32 +multi_fn = a * b +a = 1. +b = 2., 0., 0., 0. +n = 0 +validate 2. +aarch64 = fmul +generate f32:float32x2_t:f32, f32:float32x4_t:f32 + +/// Floating-point multiply +name = vmuld_lane +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = simd_extract, b:f64, b, LANE as u32 +multi_fn = a * b +a = 1. +b = 2., 0. +n = 0 +validate 2. 
+aarch64 = fmul +generate f64:float64x1_t:f64, f64:float64x2_t:f64 + /// Signed multiply long name = vmull a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 @@ -1941,6 +2055,21 @@ link-arm = vmullp._EXT_ link-aarch64 = pmull._EXT_ generate poly8x8_t:poly8x8_t:poly16x8_t +/// Polynomial multiply long +name = vmull +no-q +a = 15 +b = 3 +validate 17 +target = crypto + +aarch64 = pmull +link-aarch64 = pmull64:p64:p64:p64:int8x16_t +arm = vmull +link-arm = vmullp.v2i64:int64x1_t:int64x1_t:int64x1_t:int64x2_t +//generate p64:p64:p128 + + /// Polynomial multiply long name = vmull_high no-q @@ -1955,6 +2084,144 @@ validate 9, 30, 11, 20, 13, 18, 15, 48 aarch64 = pmull generate poly8x16_t:poly8x16_t:poly16x8_t +/// Polynomial multiply long +name = vmull_high +no-q +multi_fn = vmull-noqself-noext, {simd_extract, a, 1}, {simd_extract, b, 1} +a = 1, 15 +b = 1, 3 +validate 17 +target = crypto + +aarch64 = pmull2 +//generate poly64x2_t:poly64x2_t:p128 + +/// Vector long multiply with scalar +name = vmull +n-suffix +multi_fn = vmull-in0-noext, a, {vdup-nin0-noext, b} +a = 1, 2, 3, 4, 5, 6, 7, 8 +b = 2 +validate 2, 4, 6, 8, 10, 12, 14, 16 + +arm = vmull +aarch64 = smull +generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t +aarch64 = umull +generate uint16x4_t:u16:uint32x4_t, uint32x2_t:u32:uint64x2_t + +/// Vector long multiply by scalar +name = vmull_lane +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}} +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 1 +validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 + +arm = vmull +aarch64 = smull +generate int16x4_t:int16x4_t:int32x4_t, int16x4_t:int16x8_t:int32x4_t +generate int32x2_t:int32x2_t:int64x2_t, int32x2_t:int32x4_t:int64x2_t +aarch64 = umull +generate uint16x4_t:uint16x4_t:uint32x4_t, uint16x4_t:uint16x8_t:uint32x4_t +generate uint32x2_t:uint32x2_t:uint64x2_t, uint32x2_t:uint32x4_t:uint64x2_t + +/// Multiply long +name = vmull_high_n +no-q +multi_fn = vmull_high-noqself-noext, a, {vdup-nin0-noext, b} +a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 +b = 2 +validate 18, 20, 22, 24, 26, 28, 30, 32 + +aarch64 = smull2 +generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t +aarch64 = umull2 +generate uint16x8_t:u16:uint32x4_t, uint32x4_t:u32:uint64x2_t + +/// Multiply long +name = vmull_high_lane +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}} +a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 +b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +n = 1 +validate 18, 20, 22, 24, 26, 28, 30, 32 + +aarch64 = smull2 +generate int16x8_t:int16x4_t:int32x4_t, int16x8_t:int16x8_t:int32x4_t +generate int32x4_t:int32x2_t:int64x2_t, int32x4_t:int32x4_t:int64x2_t +aarch64 = umull2 +generate uint16x8_t:uint16x4_t:uint32x4_t, uint16x8_t:uint16x8_t:uint32x4_t +generate uint32x4_t:uint32x2_t:uint64x2_t, uint32x4_t:uint32x4_t:uint64x2_t + +/// Floating-point multiply extended +name = vmulx +a = 1., 2., 3., 4. +b = 2., 2., 2., 2. +validate 2., 4., 6., 8. 
+ +aarch64 = fmulx +link-aarch64 = fmulx._EXT_ +generate float*_t, float64x*_t + +/// Floating-point multiply extended +name = vmulx +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = vmulx-in0-noext, a, {transmute--, {simd_extract, b, LANE as u32}} +a = 1. +b = 2., 0. +n = 0 +validate 2. + +aarch64 = fmulx +generate float64x1_t, float64x1_t:float64x2_t:float64x1_t + +/// Floating-point multiply extended +name = vmulx +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}} +a = 1., 2., 3., 4. +b = 2., 0., 0., 0. +n = 0 +validate 2., 4., 6., 8. + +aarch64 = fmulx +generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t +generate float64x2_t:float64x1_t:float64x2_t, float64x2_t + +/// Floating-point multiply extended +name = vmulx +a = 2. +b = 3. +validate 6. + +aarch64 = fmulx +link-aarch64 = fmulx._EXT_ +generate f32, f64 + +/// Floating-point multiply extended +name = vmulx +lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = vmulx-out-noext, a, {simd_extract, b, LANE as u32} + +a = 2. +b = 3., 0., 0., 0. +n = 0 +validate 6. + +aarch64 = fmulx +generate f32:float32x2_t:f32, f32:float32x4_t:f32, f64:float64x1_t:f64, f64:float64x2_t:f64 + /// Floating-point fused Multiply-Add to accumulator(vector) name = vfma a = 2.0, 3.0, 4.0, 5.0 @@ -2142,7 +2409,7 @@ generate uint32x4_t:u64 name = vsubhn no-q multi_fn = fixed, c:in_t -multi_fn = simd_cast, {simd_shr, {simd_sub}, transmute(c)} +multi_fn = simd_cast, {simd_shr, {simd_sub, a, b}, transmute(c)} a = MAX, MIN, 1, 1, MAX, MIN, 1, 1 b = 1, 0, 0, 0, 1, 0, 0, 0 fixed = HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS diff --git a/library/stdarch/crates/stdarch-gen/src/main.rs b/library/stdarch/crates/stdarch-gen/src/main.rs index 5a905d92fed5..3cb23074aa22 100644 --- a/library/stdarch/crates/stdarch-gen/src/main.rs +++ b/library/stdarch/crates/stdarch-gen/src/main.rs @@ -81,7 +81,7 @@ fn type_len(t: &str) -> usize { "poly64x1_t" => 1, "poly64x2_t" => 2, "i8" | "i16" | "i32" | "i64" | "u8" | "u16" | "u32" | "u64" | "f32" | "f64" | "p8" - | "p16" => 1, + | "p16" | "p64" | "p128" => 1, _ => panic!("unknown type: {}", t), } } @@ -324,16 +324,16 @@ fn type_to_noq_suffix(t: &str) -> &str { "int16x4_t" | "int16x8_t" | "i16" => "_s16", "int32x2_t" | "int32x4_t" | "i32" => "_s32", "int64x1_t" | "int64x2_t" | "i64" => "_s64", - "uint8x8_t" | "uint8x16_t" => "_u8", - "uint16x4_t" | "uint16x8_t" => "_u16", - "uint32x2_t" | "uint32x4_t" => "_u32", - "uint64x1_t" | "uint64x2_t" => "_u64", + "uint8x8_t" | "uint8x16_t" | "u8" => "_u8", + "uint16x4_t" | "uint16x8_t" | "u16" => "_u16", + "uint32x2_t" | "uint32x4_t" | "u32" => "_u32", + "uint64x1_t" | "uint64x2_t" | "u64" => "_u64", "float16x4_t" | "float16x8_t" => "_f16", "float32x2_t" | "float32x4_t" => "_f32", "float64x1_t" | "float64x2_t" => "_f64", "poly8x8_t" | "poly8x16_t" => "_p8", "poly16x4_t" | "poly16x8_t" => "_p16", - "poly64x1_t" | "poly64x2_t" => "_p64", + "poly64x1_t" | "poly64x2_t" | "p64" => "_p64", _ => panic!("unknown type: {}", t), } } @@ -347,6 +347,7 @@ enum Suffix { NSuffix, NoQNSuffix, OutSuffix, + OutNSuffix, Lane, In2, In2Lane, @@ -354,8 +355,10 @@ enum Suffix { #[derive(Clone, Copy)] enum TargetFeature { + Default, ArmV7, FPArmV8, + Crypto, } fn type_to_global_type(t: &str) -> 
&str {
@@ -400,6 +403,8 @@ fn type_to_global_type(t: &str) -> &str {
         "f64" => "f64",
         "p8" => "p8",
         "p16" => "p16",
+        "p64" => "p64",
+        "p128" => "p128",
         _ => panic!("unknown type: {}", t),
     }
 }
@@ -492,6 +497,10 @@ fn type_to_ext(t: &str) -> &str {
         "u16" => "v4i16",
         "u32" => "v2i32",
         "u64" => "v1i64",
+        "f32" => "f32",
+        "f64" => "f64",
+        "p64" => "p64",
+        "p128" => "p128",
         /*
         "poly64x1_t" => "i64x1",
         "poly64x2_t" => "i64x2",
@@ -825,6 +834,7 @@ fn gen_aarch64(
     )],
     suffix: Suffix,
     para_num: i32,
+    target: TargetFeature,
     fixed: &Vec<String>,
     multi_fn: &Vec<String>,
 ) -> (String, String) {
@@ -846,16 +856,20 @@ fn gen_aarch64(
         NSuffix => format!("{}{}", current_name, type_to_n_suffix(in_t[1])),
         NoQNSuffix => format!("{}{}", current_name, type_to_noq_n_suffix(in_t[1])),
         OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)),
+        OutNSuffix => format!("{}{}", current_name, type_to_n_suffix(out_t)),
         Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[1])),
         In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])),
         In2Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[2])),
     };
+    let current_target = match target {
+        Default => "neon",
+        ArmV7 => "v7",
+        FPArmV8 => "fp-armv8,v8",
+        Crypto => "neon,crypto",
+    };
     let current_fn = if let Some(current_fn) = current_fn.clone() {
         if link_aarch64.is_some() {
-            panic!(
-                "[{}] Can't specify link and (multi) fn at the same time.",
-                name
-            )
+            panic!("[{}] Can't specify link and fn at the same time.", name)
         }
         current_fn
     } else if link_aarch64.is_some() {
@@ -872,7 +886,24 @@ fn gen_aarch64(
     let current_aarch64 = current_aarch64.clone().unwrap();
     let mut ext_c = String::new();
     let mut ext_c_const = String::new();
-    if let Some(link_aarch64) = link_aarch64.clone() {
+    let mut link_t: Vec<String> = vec![
+        in_t[0].to_string(),
+        in_t[1].to_string(),
+        in_t[2].to_string(),
+        out_t.to_string(),
+    ];
+    if let Some(mut link_aarch64) = link_aarch64.clone() {
+        if link_aarch64.contains(":") {
+            let links: Vec<_> = link_aarch64.split(':').map(|v| v.to_string()).collect();
+            assert_eq!(links.len(), 5);
+            link_aarch64 = links[0].to_string();
+            link_t = vec![
+                links[1].clone(),
+                links[2].clone(),
+                links[3].clone(),
+                links[4].clone(),
+            ];
+        }
         let ext = type_to_ext(in_t[0]);
         let ext2 = type_to_ext(out_t);
         let link_aarch64 = if link_aarch64.starts_with("llvm") {
@@ -893,17 +924,17 @@ fn gen_aarch64(
             current_fn,
             match para_num {
                 1 => {
-                    format!("a: {}", in_t[0])
+                    format!("a: {}", link_t[0])
                 }
                 2 => {
-                    format!("a: {}, b: {}", in_t[0], in_t[1])
+                    format!("a: {}, b: {}", link_t[0], link_t[1])
                 }
                 3 => {
-                    format!("a: {}, b: {}, c: {}", in_t[0], in_t[1], in_t[2])
+                    format!("a: {}, b: {}, c: {}", link_t[0], link_t[1], link_t[2])
                 }
                 _ => unimplemented!("unknown para_num"),
             },
-            out_t
+            link_t[3]
         );
         if const_aarch64.is_some() {
             ext_c_const = format!(
@@ -998,6 +1029,11 @@ fn gen_aarch64(
     } else {
         String::new()
     };
+    let trans: [&str; 2] = if link_t[3] != out_t {
+        ["transmute(", ")"]
+    } else {
+        ["", ""]
+    };
     let call = if let Some(const_aarch64) = const_aarch64 {
         match para_num {
             1 => format!(
@@ -1033,16 +1069,16 @@ fn gen_aarch64(
     match (multi_calls.len(), para_num, fixed.len()) {
         (0, 1, 0) => format!(
             r#"pub unsafe fn {}{}(a: {}) -> {} {{
-    {}{}(a)
+    {}{}{}(a){}
 }}"#,
-            name, const_declare, in_t[0], out_t, ext_c, current_fn,
+            name, const_declare, in_t[0], out_t, ext_c, trans[0], current_fn, trans[1]
         ),
         (0, 1, _) => {
             let fixed: Vec<String> = fixed.iter().take(type_len(in_t[0])).cloned().collect();
             format!(
                 r#"pub unsafe fn {}{}(a: {}) -> {} {{
     let 
b{};
-    {}{}(a, transmute(b))
+    {}{}{}(a, transmute(b)){}
 }}"#,
                 name,
                 const_declare,
                 in_t[0],
                 out_t,
                 values(in_t[0], &fixed),
                 ext_c,
+                trans[0],
                 current_fn,
+                trans[1],
             )
         }
         (0, 2, _) => format!(
             r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{
-    {}{}(a, b)
+    {}{}{}(a, b){}
 }}"#,
-            name, const_declare, in_t[0], in_t[1], out_t, ext_c, current_fn,
+            name, const_declare, in_t[0], in_t[1], out_t, ext_c, trans[0], current_fn, trans[1],
         ),
         (0, 3, _) => format!(
             r#"pub unsafe fn {}{}(a: {}, b: {}, c: {}) -> {} {{
@@ -1090,11 +1128,11 @@ fn gen_aarch64(
         r#"
 {}
 #[inline]
-#[target_feature(enable = "neon")]
+#[target_feature(enable = "{}")]
 #[cfg_attr(test, assert_instr({}{}))]{}
 {}
 "#,
-        current_comment, current_aarch64, const_assert, const_legacy, call
+        current_comment, current_target, current_aarch64, const_assert, const_legacy, call
     );

     let test = gen_test(
@@ -1259,6 +1297,7 @@ fn gen_arm(
         NSuffix => format!("{}{}", current_name, type_to_n_suffix(in_t[1])),
         NoQNSuffix => format!("{}{}", current_name, type_to_noq_n_suffix(in_t[1])),
         OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)),
+        OutNSuffix => format!("{}{}", current_name, type_to_n_suffix(out_t)),
         Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[1])),
         In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])),
         In2Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[2])),
     };
     let current_aarch64 = current_aarch64
         .clone()
         .unwrap_or_else(|| current_arm.to_string());
-
-    let current_target = match target {
+    let current_target_aarch64 = match target {
+        Default => "neon",
+        ArmV7 => "neon",
+        FPArmV8 => "neon",
+        Crypto => "neon,crypto",
+    };
+    let current_target_arm = match target {
+        Default => "v7",
         ArmV7 => "v7",
         FPArmV8 => "fp-armv8,v8",
+        Crypto => "crypto,v8",
     };

     let current_fn = if let Some(current_fn) = current_fn.clone() {
@@ -1292,9 +1338,57 @@ fn gen_arm(
         String::new()
     };
     let mut ext_c = String::new();
-    let mut ext_c_const_arm = String::new();
-    let mut ext_c_const_aarch64 = String::new();
-    if let (Some(link_arm), Some(link_aarch64)) = (link_arm.clone(), link_aarch64.clone()) {
+    let mut ext_c_arm = if multi_fn.is_empty() {
+        String::new()
+    } else {
+        String::from(
+            r#"
+    "#,
+        )
+    };
+    let mut ext_c_aarch64 = if multi_fn.is_empty() {
+        String::new()
+    } else {
+        String::from(
+            r#"
+    "#,
+        )
+    };
+    let mut link_arm_t: Vec<String> = vec![
+        in_t[0].to_string(),
+        in_t[1].to_string(),
+        in_t[2].to_string(),
+        out_t.to_string(),
+    ];
+    let mut link_aarch64_t: Vec<String> = vec![
+        in_t[0].to_string(),
+        in_t[1].to_string(),
+        in_t[2].to_string(),
+        out_t.to_string(),
+    ];
+    if let (Some(mut link_arm), Some(mut link_aarch64)) = (link_arm.clone(), link_aarch64.clone()) {
+        if link_arm.contains(":") {
+            let links: Vec<_> = link_arm.split(':').map(|v| v.to_string()).collect();
+            assert_eq!(links.len(), 5);
+            link_arm = links[0].to_string();
+            link_arm_t = vec![
+                links[1].clone(),
+                links[2].clone(),
+                links[3].clone(),
+                links[4].clone(),
+            ];
+        }
+        if link_aarch64.contains(":") {
+            let links: Vec<_> = link_aarch64.split(':').map(|v| v.to_string()).collect();
+            assert_eq!(links.len(), 5);
+            link_aarch64 = links[0].to_string();
+            link_aarch64_t = vec![
+                links[1].clone(),
+                links[2].clone(),
+                links[3].clone(),
+                links[4].clone(),
+            ];
+        }
         let ext = type_to_ext(in_t[0]);
         let ext2 = type_to_ext(out_t);
         let link_arm = if link_arm.starts_with("llvm") {
@@ -1311,35 +1405,36 @@ fn gen_arm(
         link.push_str(&link_aarch64); 
link.replace("_EXT_", ext).replace("_EXT2_", ext2) }; - ext_c = format!( - r#"#[allow(improper_ctypes)] + if out_t == link_arm_t[3] && out_t == link_aarch64_t[3] { + ext_c = format!( + r#"#[allow(improper_ctypes)] extern "C" {{ #[cfg_attr(target_arch = "arm", link_name = "{}")] #[cfg_attr(target_arch = "aarch64", link_name = "{}")] fn {}({}) -> {}; }} "#, - link_arm, - link_aarch64, - current_fn, - match para_num { - 1 => { - format!("a: {}", in_t[0]) - } - 2 => { - format!("a: {}, b: {}", in_t[0], in_t[1]) - } - 3 => { - format!("a: {}, b: {}, c: {}", in_t[0], in_t[1], in_t[2]) - } - _ => unimplemented!("unknown para_num"), - }, - out_t - ); + link_arm, + link_aarch64, + current_fn, + match para_num { + 1 => { + format!("a: {}", in_t[0]) + } + 2 => { + format!("a: {}, b: {}", in_t[0], in_t[1]) + } + 3 => { + format!("a: {}, b: {}, c: {}", in_t[0], in_t[1], in_t[2]) + } + _ => unimplemented!("unknown para_num"), + }, + out_t + ); + }; if const_arm.is_some() { - ext_c_const_arm = format!( - r#" - #[allow(improper_ctypes)] + ext_c_arm.push_str(&format!( + r#"#[allow(improper_ctypes)] extern "C" {{ #[cfg_attr(target_arch = "arm", link_name = "{}")] fn {}({}) -> {}; @@ -1363,12 +1458,39 @@ fn gen_arm( _ => unimplemented!("unknown para_num"), }, out_t - ); + )); + }; + if out_t != link_arm_t[3] { + ext_c_arm.push_str(&format!( + r#"#[allow(improper_ctypes)] + extern "C" {{ + #[cfg_attr(target_arch = "arm", link_name = "{}")] + fn {}({}) -> {}; + }} +"#, + link_arm, + current_fn, + match para_num { + 1 => { + format!("a: {}", link_arm_t[0]) + } + 2 => { + format!("a: {}, b: {}", link_arm_t[0], link_arm_t[1]) + } + 3 => { + format!( + "a: {}, b: {}, c: {}", + link_arm_t[0], link_arm_t[1], link_arm_t[2] + ) + } + _ => unimplemented!("unknown para_num"), + }, + link_arm_t[3] + )); } if const_aarch64.is_some() { - ext_c_const_aarch64 = format!( - r#" - #[allow(improper_ctypes)] + ext_c_aarch64.push_str(&format!( + r#"#[allow(improper_ctypes)] extern "C" {{ #[cfg_attr(target_arch = "aarch64", link_name = "{}")] fn {}({}) -> {}; @@ -1389,7 +1511,35 @@ fn gen_arm( _ => unimplemented!("unknown para_num"), }, out_t - ); + )); + } + if out_t != link_aarch64_t[3] { + ext_c_aarch64.push_str(&format!( + r#"#[allow(improper_ctypes)] + extern "C" {{ + #[cfg_attr(target_arch = "aarch64", link_name = "{}")] + fn {}({}) -> {}; + }} +"#, + link_aarch64, + current_fn, + match para_num { + 1 => { + format!("a: {}", link_aarch64_t[0]) + } + 2 => { + format!("a: {}, b: {}", link_aarch64_t[0], link_aarch64_t[1]) + } + 3 => { + format!( + "a: {}, b: {}, c: {}", + link_aarch64_t[0], link_aarch64_t[1], link_aarch64_t[2] + ) + } + _ => unimplemented!("unknown para_num"), + }, + link_aarch64_t[3] + )); } }; let multi_calls = if !multi_fn.is_empty() { @@ -1430,6 +1580,11 @@ fn gen_arm( } else { String::new() }; + let trans: [&str; 2] = if out_t == link_arm_t[3] && out_t == link_aarch64_t[3] { + ["", ""] + } else { + ["transmute(", ")"] + }; let call = match (multi_calls.len(), para_num, fixed.len()) { (0, 1, 0) => format!( r#"pub unsafe fn {}{}(a: {}) -> {} {{ @@ -1485,7 +1640,7 @@ fn gen_arm( ), (_, _, _) => String::new(), }; - let call_const_arm = if let Some(const_arm) = const_arm { + let call_arm = if let Some(const_arm) = const_arm { let const_arm = const_arm.replace("ttn", type_to_native_type(in_t[1])); let mut cnt = String::from(in_t[1]); cnt.push_str("("); @@ -1501,20 +1656,61 @@ fn gen_arm( r#"pub unsafe fn {}{}(a: {}) -> {} {{ {}{}{}(a, {}) }}"#, - name, const_declare, in_t[0], out_t, multi_calls, ext_c_const_arm, 
current_fn, cnt
+            name, const_declare, in_t[0], out_t, multi_calls, ext_c_arm, current_fn, cnt
         ),
         2 => format!(
-            r#"pub unsafe fn {}{}(a: {}) -> {} {{
+            r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{
     {}{}{}(a, b, {})
 }}"#,
-            name, const_declare, in_t[0], out_t, multi_calls, ext_c_const_arm, current_fn, cnt
+            name,
+            const_declare,
+            in_t[0],
+            in_t[1],
+            out_t,
+            multi_calls,
+            ext_c_arm,
+            current_fn,
+            cnt
+        ),
+        _ => String::new(),
+        }
+    } else if out_t != link_arm_t[3] {
+        match para_num {
+            1 => format!(
+                r#"pub unsafe fn {}{}(a: {}) -> {} {{
+    {}{}{}{}(a){}
+}}"#,
+                name,
+                const_declare,
+                in_t[0],
+                out_t,
+                multi_calls,
+                ext_c_arm,
+                trans[0],
+                current_fn,
+                trans[1]
+            ),
+            2 => format!(
+                r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{
+    {}{}{}{}(transmute(a), transmute(b)){}
+}}"#,
+                name,
+                const_declare,
+                in_t[0],
+                in_t[1],
+                out_t,
+                multi_calls,
+                ext_c_arm,
+                trans[0],
+                current_fn,
+                trans[1],
+            ),
+            _ => String::new(),
+        }
     } else {
         String::new()
     };
-    let call_const_aarch64 = if let Some(const_aarch64) = const_aarch64 {
+    let call_aarch64 = if let Some(const_aarch64) = const_aarch64 {
         match para_num {
             1 => format!(
                 r#"pub unsafe fn {}{}(a: {}) -> {} {{
@@ -1525,55 +1721,94 @@ fn gen_arm(
                 in_t[0],
                 out_t,
                 multi_calls,
-                ext_c_const_aarch64,
+                ext_c_aarch64,
                 current_fn,
                 const_aarch64
             ),
             2 => format!(
-                r#"pub unsafe fn {}{}(a: {}) -> {} {{
+                r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{
     {}{}{}(a, b, {})
+}}"#,
+                name,
+                const_declare,
+                in_t[0],
+                in_t[1],
+                out_t,
+                multi_calls,
+                ext_c_aarch64,
+                current_fn,
+                const_aarch64
+            ),
+            _ => String::new(),
+        }
+    } else if out_t != link_aarch64_t[3] {
+        match para_num {
+            1 => format!(
+                r#"pub unsafe fn {}{}(a: {}) -> {} {{
+    {}{}{}{}(a){}
 }}"#,
                 name,
                 const_declare,
                 in_t[0],
                 out_t,
                 multi_calls,
-                ext_c_const_aarch64,
+                ext_c_aarch64,
+                trans[0],
                 current_fn,
-                const_aarch64
+                trans[1],
+            ),
+            2 => format!(
+                r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{
+    {}{}{}{}(a, b){}
+}}"#,
+                name,
+                const_declare,
+                in_t[0],
+                in_t[1],
+                out_t,
+                multi_calls,
+                ext_c_aarch64,
+                trans[0],
+                current_fn,
+                trans[1],
             ),
             _ => String::new(),
         }
     } else {
         String::new()
     };
-    let function = if const_arm.is_some() && const_aarch64.is_some() {
+    let function = if (const_arm.is_some() && const_aarch64.is_some())
+        || out_t != link_arm_t[3]
+        || out_t != link_aarch64_t[3]
+    {
        format!(
            r#"
 {}
 #[inline]
 #[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
+#[target_feature(enable = "neon,{}")]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr({}{}))]{}
 {}

 {}
 #[inline]
 #[cfg(target_arch = "aarch64")]
-#[target_feature(enable = "neon")]
+#[target_feature(enable = "{}")]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr({}{}))]{}
 {}
 "#,
             current_comment,
+            current_target_arm,
             expand_intrinsic(&current_arm, in_t[1]),
             const_assert,
             const_legacy,
-            call_const_arm,
+            call_arm,
             current_comment,
+            current_target_aarch64,
             expand_intrinsic(&current_aarch64, in_t[1]),
             const_assert,
             const_legacy,
-            call_const_aarch64,
+            call_aarch64,
         )
     } else {
         format!(
@@ -1587,7 +1822,7 @@ fn gen_arm(
 {}
 "#,
             current_comment,
-            current_target,
+            current_target_arm,
             expand_intrinsic(&current_arm, in_t[1]),
             const_assert,
             expand_intrinsic(&current_aarch64, in_t[1]),
@@ -1755,6 +1990,7 @@ fn get_call(
         let len = match &*fn_format[1] {
             "out_len" => type_len(out_t),
             "in_len" => type_len(in_t[1]),
+            "in0_len" => type_len(in_t[0]),
             "halflen" => type_len(in_t[1]) / 2,
             _ => 0,
         };
@@ -2003,6 +2239,8 @@ fn get_call(
             fn_name.push_str(type_to_n_suffix(in_t[1]));
         } else if fn_format[1] == "out" { 
fn_name.push_str(type_to_suffix(out_t));
+        } else if fn_format[1] == "in0" {
+            fn_name.push_str(type_to_suffix(in_t[0]));
         } else if fn_format[1] == "in2" {
             fn_name.push_str(type_to_suffix(in_t[2]));
         } else if fn_format[1] == "signed" {
@@ -2028,6 +2266,8 @@ fn get_call(
             fn_name.push_str(&(type_len(in_t[1]) / 2).to_string());
         } else if fn_format[1] == "nout" {
             fn_name.push_str(type_to_n_suffix(out_t));
+        } else if fn_format[1] == "nin0" {
+            fn_name.push_str(type_to_n_suffix(in_t[0]));
         } else if fn_format[1] == "nsigned" {
             fn_name.push_str(type_to_n_suffix(type_to_signed(in_t[1])));
         } else if fn_format[1] == "in_ntt" {
@@ -2063,7 +2303,7 @@ fn get_call(
         }
     }
     if param_str.is_empty() {
-        param_str.push_str("a, b");
+        return fn_name;
    }
    let fn_str = if let Some((re_name, re_type)) = re.clone() {
        format!(
@@ -2108,7 +2348,7 @@ fn main() -> io::Result<()> {
         Vec<String>,
     )> = Vec::new();
     let mut multi_fn: Vec<String> = Vec::new();
-    let mut target: TargetFeature = ArmV7;
+    let mut target: TargetFeature = Default;

     //
     // THIS FILE IS GENERATED FORM neon.spec DO NOT CHANGE IT MANUALLY
@@ -2189,7 +2429,7 @@ mod test {
             fixed = Vec::new();
             n = None;
             multi_fn = Vec::new();
-            target = ArmV7;
+            target = Default;
         } else if line.starts_with("//") {
         } else if line.starts_with("name = ") {
             current_name = Some(String::from(&line[7..]));
@@ -2211,6 +2451,8 @@ mod test {
             suffix = NoQDouble;
         } else if line.starts_with("n-suffix") {
             suffix = NSuffix;
+        } else if line.starts_with("out-n-suffix") {
+            suffix = OutNSuffix;
         } else if line.starts_with("noq-n-suffix") {
             suffix = NoQNSuffix;
         } else if line.starts_with("out-suffix") {
@@ -2245,10 +2487,12 @@ mod test {
         } else if line.starts_with("target = ") {
             target = match Some(String::from(&line[9..])) {
                 Some(input) => match input.as_str() {
+                    "v7" => ArmV7,
                     "fp-armv8" => FPArmV8,
-                    _ => ArmV7,
+                    "crypto" => Crypto,
+                    _ => Default,
                 },
-                _ => ArmV7,
+                _ => Default,
             }
         } else if line.starts_with("generate ") {
             let line = &line[9..];
@@ -2328,6 +2572,7 @@ mod test {
                 &current_tests,
                 suffix,
                 para_num,
+                target,
                 &fixed,
                 &multi_fn,
             );
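
Reviewer note (not part of the diff): a minimal sketch of how the new intrinsics
are meant to be called once this lands, assuming a nightly toolchain that
exposes them under core::arch::aarch64; the `demo` function and its values are
illustrative only.

    #[cfg(target_arch = "aarch64")]
    #[target_feature(enable = "neon")]
    unsafe fn demo() {
        use core::arch::aarch64::*;

        let a = vdupq_n_f32(2.0); // all four lanes = 2.0
        let b = vdupq_n_f32(3.0); // all four lanes = 3.0

        // vmul*_n_*: multiply every lane of `a` by one scalar.
        let _scaled = vmulq_n_f32(a, 4.0); // [8.0, 8.0, 8.0, 8.0]

        // vmul*_lane*_*: multiply by a single lane of `b`. LANE is a const
        // generic, range-checked at compile time (static_assert_imm2!).
        let _by_lane = vmulq_laneq_f32::<1>(a, b); // [6.0, 6.0, 6.0, 6.0]

        // vmulx: like fmul, except that 0.0 * +/-infinity returns +/-2.0
        // instead of NaN, which suits reciprocal-step sequences.
        let _extended = vmulxq_f32(a, b); // [6.0, 6.0, 6.0, 6.0]
    }

The extended spec link form used for pmull64 above
(link-aarch64 = pmull64:p64:p64:p64:int8x16_t) is also new in this patch: when
the declared link types differ from the intrinsic's public signature, the
generator now wraps the call in transmute(), via link_t/link_arm_t/link_aarch64_t
in gen_aarch64 and gen_arm.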