From 1e15fa3f0a492be3666cc6b9bfe4d82b2efa2c5c Mon Sep 17 00:00:00 2001 From: Jacob Bramley Date: Tue, 23 May 2023 15:08:57 +0100 Subject: [PATCH] Add support for AArch64 i8mm *dot intrinsics. This includes vsudot and vusdot, which perform mixed-signedness dot product operations. --- .../core_arch/src/aarch64/neon/generated.rs | 96 +++++++++ .../src/arm_shared/neon/generated.rs | 188 ++++++++++++++++++ .../crates/intrinsic-test/missing_aarch64.txt | 10 - .../crates/intrinsic-test/missing_arm.txt | 10 - library/stdarch/crates/stdarch-gen/neon.spec | 143 +++++++++++-- .../stdarch/crates/stdarch-gen/src/main.rs | 19 ++ 6 files changed, 430 insertions(+), 36 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs index 7c039d91b3c6..25c119cbe34d 100644 --- a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs +++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs @@ -8353,6 +8353,62 @@ pub unsafe fn vst4q_lane_f64(a: *mut f64, b: float64x2x4_t) { vst4q_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a as _) } +/// Dot product index form with unsigned and signed integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_laneq_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(test, assert_instr(usdot, LANE = 3))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vusdot_laneq_s32(a: int32x2_t, b: uint8x8_t, c: int8x16_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 2); + let c: int32x4_t = transmute(c); + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vusdot_s32(a, b, transmute(c)) +} + +/// Dot product index form with unsigned and signed integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_laneq_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(test, assert_instr(usdot, LANE 
= 3))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vusdotq_laneq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + let c: int32x4_t = transmute(c); + let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vusdotq_s32(a, b, transmute(c)) +} + +/// Dot product index form with signed and unsigned integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudot_laneq_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(test, assert_instr(sudot, LANE = 3))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsudot_laneq_s32(a: int32x2_t, b: int8x8_t, c: uint8x16_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 2); + let c: uint32x4_t = transmute(c); + let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vusdot_s32(a, transmute(c), b) +} + +/// Dot product index form with signed and unsigned integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudotq_laneq_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(test, assert_instr(sudot, LANE = 3))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsudotq_laneq_s32(a: int32x4_t, b: int8x16_t, c: uint8x16_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + let c: uint32x4_t = transmute(c); + let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vusdotq_s32(a, transmute(c), b) +} + /// Multiply /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_f64) @@ -22184,6 +22240,46 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vusdot_laneq_s32() { + let a: i32x2 = i32x2::new(1000, -4200); + let b: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); + let c: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11); + 
let e: i32x2 = i32x2::new(-3420, -10140); + let r: i32x2 = transmute(vusdot_laneq_s32::<3>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vusdotq_laneq_s32() { + let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000); + let b: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250); + let c: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11); + let e: i32x4 = i32x4::new(-3420, -10140, -8460, -6980); + let r: i32x4 = transmute(vusdotq_laneq_s32::<3>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vsudot_laneq_s32() { + let a: i32x2 = i32x2::new(-2000, 4200); + let b: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); + let c: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250); + let e: i32x2 = i32x2::new(300, 2740); + let r: i32x2 = transmute(vsudot_laneq_s32::<3>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vsudotq_laneq_s32() { + let a: i32x4 = i32x4::new(-2000, 4200, -1000, 2000); + let b: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11); + let c: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250); + let e: i32x4 = i32x4::new(300, 2740, -6220, -6980); + let r: i32x4 = transmute(vsudotq_laneq_s32::<3>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmul_f64() { let a: f64 = 1.0; diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs index 9fef95c26c7c..6382607f9904 100644 --- a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs +++ 
b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs @@ -16775,6 +16775,106 @@ pub unsafe fn vst4q_lane_f32(a: *mut f32, b: float32x4x4_t) { vst4q_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a as _) } +/// Dot product vector form with unsigned and signed integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot))] +pub unsafe fn vusdot_s32(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usdot.v2i32.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.usdot.v2i32.v8i8")] + fn vusdot_s32_(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t; + } +vusdot_s32_(a, b, c) +} + +/// Dot product vector form with unsigned and signed integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot))] +pub unsafe fn vusdotq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usdot.v4i32.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.usdot.v4i32.v16i8")] + fn vusdotq_s32_(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t; + } +vusdotq_s32_(a, b, c) +} + +/// Dot product index form with unsigned and signed integers +/// +/// [Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_lane_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vusdot_lane_s32(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + let c: int32x2_t = transmute(c); + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vusdot_s32(a, b, transmute(c)) +} + +/// Dot product index form with unsigned and signed integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_lane_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vusdotq_lane_s32(a: int32x4_t, b: uint8x16_t, c: int8x8_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + let c: int32x2_t = transmute(c); + let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vusdotq_s32(a, b, transmute(c)) +} + +/// Dot product index form with signed and unsigned integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudot_lane_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsudot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sudot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsudot_lane_s32(a: int32x2_t, b: 
int8x8_t, c: uint8x8_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + let c: uint32x2_t = transmute(c); + let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vusdot_s32(a, transmute(c), b) +} + +/// Dot product index form with signed and unsigned integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudotq_lane_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsudot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sudot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsudotq_lane_s32(a: int32x4_t, b: int8x16_t, c: uint8x8_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + let c: uint32x2_t = transmute(c); + let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vusdotq_s32(a, transmute(c), b) +} + /// Multiply /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_s8) @@ -37823,6 +37923,94 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vusdot_s32() { + let a: i32x2 = i32x2::new(1000, -4200); + let b: u8x8 = u8x8::new(100, 205, 110, 195, 120, 185, 130, 175); + let c: i8x8 = i8x8::new(0, 1, 2, 3, -1, -2, -3, -4); + let e: i32x2 = i32x2::new(2010, -5780); + let r: i32x2 = transmute(vusdot_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vusdotq_s32() { + let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000); + let b: u8x16 = u8x16::new(100, 205, 110, 195, 120, 185, 130, 175, 140, 165, 150, 155, 160, 145, 170, 135); + let c: i8x16 = i8x16::new(0, 1, 2, 3, -1, -2, -3, -4, 4, 5, 6, 7, -5, -6, -7, -8); + let e: i32x4 = i32x4::new(2010, -5780, 2370, -1940); + let r: i32x4 = transmute(vusdotq_s32(transmute(a), 
transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vusdot_lane_s32() { + let a: i32x2 = i32x2::new(1000, -4200); + let b: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); + let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); + let e: i32x2 = i32x2::new(2100, -2700); + let r: i32x2 = transmute(vusdot_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + + let a: i32x2 = i32x2::new(1000, -4200); + let b: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); + let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); + let e: i32x2 = i32x2::new(260, -5180); + let r: i32x2 = transmute(vusdot_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vusdotq_lane_s32() { + let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000); + let b: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250); + let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); + let e: i32x4 = i32x4::new(2100, -2700, 900, 4300); + let r: i32x4 = transmute(vusdotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + + let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000); + let b: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250); + let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); + let e: i32x4 = i32x4::new(260, -5180, -2220, 540); + let r: i32x4 = transmute(vusdotq_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vsudot_lane_s32() { + let a: i32x2 = i32x2::new(-2000, 4200); + let b: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); + let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); + let e: i32x2 = i32x2::new(-900, 3460); + let r: i32x2 = transmute(vsudot_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); + 
assert_eq!(r, e); + + let a: i32x2 = i32x2::new(-2000, 4200); + let b: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); + let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); + let e: i32x2 = i32x2::new(-500, 3220); + let r: i32x2 = transmute(vsudot_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vsudotq_lane_s32() { + let a: i32x4 = i32x4::new(-2000, 4200, -1000, 2000); + let b: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11); + let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); + let e: i32x4 = i32x4::new(-900, 3460, -3580, -2420); + let r: i32x4 = transmute(vsudotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + + let a: i32x4 = i32x4::new(-2000, 4200, -1000, 2000); + let b: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11); + let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); + let e: i32x4 = i32x4::new(-500, 3220, -4460, -3940); + let r: i32x4 = transmute(vsudotq_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmul_s8() { let a: i8x8 = i8x8::new(1, 2, 1, 2, 1, 2, 1, 2); diff --git a/library/stdarch/crates/intrinsic-test/missing_aarch64.txt b/library/stdarch/crates/intrinsic-test/missing_aarch64.txt index b09d677aff02..85bdd6fb3934 100644 --- a/library/stdarch/crates/intrinsic-test/missing_aarch64.txt +++ b/library/stdarch/crates/intrinsic-test/missing_aarch64.txt @@ -12,16 +12,6 @@ vbfmlaltq_f32 vbfmlaltq_lane_f32 vbfmlaltq_laneq_f32 vbfmmlaq_f32 -vsudot_laneq_s32 -vsudot_lane_s32 -vsudotq_laneq_s32 -vsudotq_lane_s32 -vusdot_laneq_s32 -vusdot_lane_s32 -vusdotq_laneq_s32 -vusdotq_lane_s32 -vusdotq_s32 -vusdot_s32 # Missing from both Clang and stdarch diff --git a/library/stdarch/crates/intrinsic-test/missing_arm.txt 
b/library/stdarch/crates/intrinsic-test/missing_arm.txt index 3acc61678961..07524b67790f 100644 --- a/library/stdarch/crates/intrinsic-test/missing_arm.txt +++ b/library/stdarch/crates/intrinsic-test/missing_arm.txt @@ -12,16 +12,6 @@ vbfmlaltq_f32 vbfmlaltq_lane_f32 vbfmlaltq_laneq_f32 vbfmmlaq_f32 -vsudot_laneq_s32 -vsudot_lane_s32 -vsudotq_laneq_s32 -vsudotq_lane_s32 -vusdot_laneq_s32 -vusdot_lane_s32 -vusdotq_laneq_s32 -vusdotq_lane_s32 -vusdotq_s32 -vusdot_s32 # Implemented in Clang and stdarch for A64 only even though CSV claims A32 support __crc32d diff --git a/library/stdarch/crates/stdarch-gen/neon.spec b/library/stdarch/crates/stdarch-gen/neon.spec index 86de0c328e0b..5aaa7305b58e 100644 --- a/library/stdarch/crates/stdarch-gen/neon.spec +++ b/library/stdarch/crates/stdarch-gen/neon.spec @@ -3478,27 +3478,138 @@ link-arm = vst4lane._EXTpi8r_ const-arm = LANE generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void +/// Dot product vector form with unsigned and signed integers +name = vusdot +out-suffix +a = 1000, -4200, -1000, 2000 +b = 100, 205, 110, 195, 120, 185, 130, 175, 140, 165, 150, 155, 160, 145, 170, 135 +c = 0, 1, 2, 3, -1, -2, -3, -4, 4, 5, 6, 7, -5, -6, -7, -8 +aarch64 = usdot +arm = vusdot +target = i8mm + +// 1000 + (100, 205, 110, 195) . ( 0, 1, 2, 3) +// -4200 + (120, 185, 130, 175) . (-1, -2, -3, -4) +// ... 
+validate 2010, -5780, 2370, -1940 + +link-arm = usdot._EXT2_._EXT4_:int32x2_t:uint8x8_t:int8x8_t:int32x2_t +link-aarch64 = usdot._EXT2_._EXT4_:int32x2_t:uint8x8_t:int8x8_t:int32x2_t +generate int32x2_t:uint8x8_t:int8x8_t:int32x2_t + +link-arm = usdot._EXT2_._EXT4_:int32x4_t:uint8x16_t:int8x16_t:int32x4_t +link-aarch64 = usdot._EXT2_._EXT4_:int32x4_t:uint8x16_t:int8x16_t:int32x4_t +generate int32x4_t:uint8x16_t:int8x16_t:int32x4_t + +/// Dot product index form with unsigned and signed integers +name = vusdot +out-lane-suffixes +constn = LANE +aarch64 = usdot +arm = vusdot +target = i8mm +multi_fn = static_assert_imm-in2_dot-LANE +multi_fn = transmute, c:merge4_t2, c +multi_fn = simd_shuffle!, c:out_signed, c, c, {dup-out_len-LANE as u32} +multi_fn = vusdot-out-noext, a, b, {transmute, c} +a = 1000, -4200, -1000, 2000 +b = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250 +c = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11 + +// 1000 + (100, 110, 120, 130) . (4, 3, 2, 1) +// -4200 + (140, 150, 160, 170) . (4, 3, 2, 1) +// ... +n = 0 +validate 2100, -2700, 900, 4300 + +// 1000 + (100, 110, 120, 130) . (0, -1, -2, -3) +// -4200 + (140, 150, 160, 170) . (0, -1, -2, -3) +// ... +n = 1 +validate 260, -5180, -2220, 540 + +generate int32x2_t:uint8x8_t:int8x8_t:int32x2_t +generate int32x4_t:uint8x16_t:int8x8_t:int32x4_t + +/// Dot product index form with unsigned and signed integers +name = vusdot +out-lane-suffixes +constn = LANE +// Only AArch64 has the laneq forms. +aarch64 = usdot +target = i8mm +multi_fn = static_assert_imm-in2_dot-LANE +multi_fn = transmute, c:merge4_t2, c +multi_fn = simd_shuffle!, c:out_signed, c, c, {dup-out_len-LANE as u32} +multi_fn = vusdot-out-noext, a, b, {transmute, c} +a = 1000, -4200, -1000, 2000 +b = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250 +c = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11 + +// 1000 + (100, 110, 120, 130) . 
(-8, -9, -10, -11) +// -4200 + (140, 150, 160, 170) . (-8, -9, -10, -11) +// ... +n = 3 +validate -3420, -10140, -8460, -6980 + +generate int32x2_t:uint8x8_t:int8x16_t:int32x2_t +generate int32x4_t:uint8x16_t:int8x16_t:int32x4_t + /// Dot product index form with signed and unsigned integers name = vsudot out-lane-suffixes constn = LANE -multi_fn = static_assert_imm-in2_dot-LANE -multi_fn = simd_shuffle!, c:unsigned, c, c, {base-4-LANE} -multi_fn = vsudot-outlane-_, a, b, c -a = 1, 2, 1, 2 -b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 -c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 -n = 0 -validate 31, 72, 31, 72 -target = dotprod - aarch64 = sudot -link-aarch64 = usdot._EXT2_._EXT4_:int32x2_t:int8x8_t:uint8x8_t:int32x2_t -// LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot -//generate int32x2_t:int8x8_t:uint8x8_t:int32x2_t, int32x2_t:int8x8_t:uint8x16_t:int32x2_t -link-aarch64 = usdot._EXT2_._EXT4_:int32x4_t:int8x16_t:uint8x16_t:int32x4_t -// LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot -//generate int32x4_t:int8x16_t:uint8x8_t:int32x4_t, int32x4_t:int8x16_t:uint8x16_t:int32x4_t +arm = vsudot +target = i8mm + +multi_fn = static_assert_imm-in2_dot-LANE +multi_fn = transmute, c:merge4_t2, c +multi_fn = simd_shuffle!, c:out_unsigned, c, c, {dup-out_len-LANE as u32} +multi_fn = vusdot-out-noext, a, {transmute, c}, b +a = -2000, 4200, -1000, 2000 +b = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11 +c = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250 + +// -2000 + (4, 3, 2, 1) . (100, 110, 120, 130) +// 4200 + (0, -1, -2, -3) . (100, 110, 120, 130) +// ... +n = 0 +validate -900, 3460, -3580, -2420 + +// -2000 + (4, 3, 2, 1) . (140, 150, 160, 170) +// 4200 + (0, -1, -2, -3) . (140, 150, 160, 170) +// ... 
+n = 1 +validate -500, 3220, -4460, -3940 + +generate int32x2_t:int8x8_t:uint8x8_t:int32x2_t +generate int32x4_t:int8x16_t:uint8x8_t:int32x4_t + +/// Dot product index form with signed and unsigned integers +name = vsudot +out-lane-suffixes +constn = LANE +// Only AArch64 has the laneq forms. +aarch64 = sudot +target = i8mm + +multi_fn = static_assert_imm-in2_dot-LANE +multi_fn = transmute, c:merge4_t2, c +multi_fn = simd_shuffle!, c:out_unsigned, c, c, {dup-out_len-LANE as u32} +multi_fn = vusdot-out-noext, a, {transmute, c}, b +a = -2000, 4200, -1000, 2000 +b = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11 +c = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250 + +// -2000 + (4, 3, 2, 1) . (220, 230, 240, 250) +// 4200 + (0, -1, -2, -3) . (220, 230, 240, 250) +// ... +n = 3 +validate 300, 2740, -6220, -6980 + +generate int32x2_t:int8x8_t:uint8x16_t:int32x2_t +generate int32x4_t:int8x16_t:uint8x16_t:int32x4_t /// Multiply name = vmul diff --git a/library/stdarch/crates/stdarch-gen/src/main.rs b/library/stdarch/crates/stdarch-gen/src/main.rs index 652aee88c893..39de2c1c4254 100644 --- a/library/stdarch/crates/stdarch-gen/src/main.rs +++ b/library/stdarch/crates/stdarch-gen/src/main.rs @@ -799,6 +799,19 @@ fn type_to_half(t: &str) -> &str { } } +fn type_with_merged_lanes(t: &str, elements_per_lane: usize) -> String { + assert_eq!(type_len(t) % elements_per_lane, 0); + let prefix_len = t + .find(|c: char| c.is_ascii_digit()) + .unwrap_or_else(|| t.len()); + format!( + "{prefix}{bits}x{len}_t", + prefix = &t[0..prefix_len], + bits = type_bits(t) * elements_per_lane, + len = type_len(t) / elements_per_lane + ) +} + fn asc(start: i32, len: usize) -> String { let mut s = String::from("["); for i in 0..len { @@ -2993,6 +3006,12 @@ fn get_call( re = Some((re_params[0].clone(), in_t[1].to_string())); } else if re_params[1] == "out_t" { re = Some((re_params[0].clone(), out_t.to_string())); + } else if re_params[1] == "out_unsigned" { 
+ re = Some((re_params[0].clone(), type_to_unsigned(out_t).to_string())); + } else if re_params[1] == "out_signed" { + re = Some((re_params[0].clone(), type_to_signed(out_t).to_string())); + } else if re_params[1] == "merge4_t2" { + re = Some((re_params[0].clone(), type_with_merged_lanes(in_t[2], 4))); } else if re_params[1] == "half" { re = Some((re_params[0].clone(), type_to_half(in_t[1]).to_string())); } else if re_params[1] == "in_ntt" {