Support AArch32 Neon dotprod intrinsics.
Note that runtime feature detection requires a recent Linux kernel (v6.2 or later).
This commit is contained in:
parent
1e15fa3f0a
commit
a9fecd8456
7 changed files with 305 additions and 236 deletions
|
|
@ -10557,80 +10557,7 @@ pub unsafe fn vcmlaq_rot270_laneq_f32<const LANE: i32>(a: float32x4_t, b: float3
|
|||
vcmlaq_rot270_f32(a, b, c)
|
||||
}
|
||||
|
||||
/// Dot product arithmetic
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_s32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(test, assert_instr(sdot))]
|
||||
pub unsafe fn vdot_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t {
|
||||
#[allow(improper_ctypes)]
|
||||
extern "unadjusted" {
|
||||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v2i32.v8i8")]
|
||||
fn vdot_s32_(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t;
|
||||
}
|
||||
vdot_s32_(a, b, c)
|
||||
}
|
||||
|
||||
/// Dot product arithmetic
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_s32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(test, assert_instr(sdot))]
|
||||
pub unsafe fn vdotq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t {
|
||||
#[allow(improper_ctypes)]
|
||||
extern "unadjusted" {
|
||||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v4i32.v16i8")]
|
||||
fn vdotq_s32_(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t;
|
||||
}
|
||||
vdotq_s32_(a, b, c)
|
||||
}
|
||||
|
||||
/// Dot product arithmetic
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_u32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(test, assert_instr(udot))]
|
||||
pub unsafe fn vdot_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t {
|
||||
#[allow(improper_ctypes)]
|
||||
extern "unadjusted" {
|
||||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v2i32.v8i8")]
|
||||
fn vdot_u32_(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t;
|
||||
}
|
||||
vdot_u32_(a, b, c)
|
||||
}
|
||||
|
||||
/// Dot product arithmetic
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_u32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(test, assert_instr(udot))]
|
||||
pub unsafe fn vdotq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t {
|
||||
#[allow(improper_ctypes)]
|
||||
extern "unadjusted" {
|
||||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v4i32.v16i8")]
|
||||
fn vdotq_u32_(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t;
|
||||
}
|
||||
vdotq_u32_(a, b, c)
|
||||
}
|
||||
|
||||
/// Dot product arithmetic
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_s32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(test, assert_instr(sdot, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t {
|
||||
static_assert_uimm_bits!(LANE, 1);
|
||||
let c: int8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
|
||||
vdot_s32(a, b, c)
|
||||
}
|
||||
|
||||
/// Dot product arithmetic
|
||||
/// Dot product arithmetic (indexed)
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_laneq_s32)
|
||||
#[inline]
|
||||
|
|
@ -10639,24 +10566,12 @@ pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x
|
|||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vdot_laneq_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x16_t) -> int32x2_t {
|
||||
static_assert_uimm_bits!(LANE, 2);
|
||||
let c: int8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
|
||||
vdot_s32(a, b, c)
|
||||
let c: int32x4_t = transmute(c);
|
||||
let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
|
||||
vdot_s32(a, b, transmute(c))
|
||||
}
|
||||
|
||||
/// Dot product arithmetic
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_s32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(test, assert_instr(sdot, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x8_t) -> int32x4_t {
|
||||
static_assert_uimm_bits!(LANE, 1);
|
||||
let c: int8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
|
||||
vdotq_s32(a, b, c)
|
||||
}
|
||||
|
||||
/// Dot product arithmetic
|
||||
/// Dot product arithmetic (indexed)
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_laneq_s32)
|
||||
#[inline]
|
||||
|
|
@ -10665,24 +10580,12 @@ pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int
|
|||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vdotq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t {
|
||||
static_assert_uimm_bits!(LANE, 2);
|
||||
let c: int8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
|
||||
vdotq_s32(a, b, c)
|
||||
let c: int32x4_t = transmute(c);
|
||||
let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
|
||||
vdotq_s32(a, b, transmute(c))
|
||||
}
|
||||
|
||||
/// Dot product arithmetic
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_u32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(test, assert_instr(udot, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t {
|
||||
static_assert_uimm_bits!(LANE, 1);
|
||||
let c: uint8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
|
||||
vdot_u32(a, b, c)
|
||||
}
|
||||
|
||||
/// Dot product arithmetic
|
||||
/// Dot product arithmetic (indexed)
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_laneq_u32)
|
||||
#[inline]
|
||||
|
|
@ -10691,24 +10594,12 @@ pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uin
|
|||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vdot_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x16_t) -> uint32x2_t {
|
||||
static_assert_uimm_bits!(LANE, 2);
|
||||
let c: uint8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
|
||||
vdot_u32(a, b, c)
|
||||
let c: uint32x4_t = transmute(c);
|
||||
let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
|
||||
vdot_u32(a, b, transmute(c))
|
||||
}
|
||||
|
||||
/// Dot product arithmetic
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_u32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(test, assert_instr(udot, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x8_t) -> uint32x4_t {
|
||||
static_assert_uimm_bits!(LANE, 1);
|
||||
let c: uint8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
|
||||
vdotq_u32(a, b, c)
|
||||
}
|
||||
|
||||
/// Dot product arithmetic
|
||||
/// Dot product arithmetic (indexed)
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_laneq_u32)
|
||||
#[inline]
|
||||
|
|
@ -10717,8 +10608,9 @@ pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: u
|
|||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vdotq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t {
|
||||
static_assert_uimm_bits!(LANE, 2);
|
||||
let c: uint8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
|
||||
vdotq_u32(a, b, c)
|
||||
let c: uint32x4_t = transmute(c);
|
||||
let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
|
||||
vdotq_u32(a, b, transmute(c))
|
||||
}
|
||||
|
||||
/// Maximum (vector)
|
||||
|
|
@ -23759,122 +23651,42 @@ mod test {
|
|||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdot_s32() {
|
||||
let a: i32x2 = i32x2::new(1, 2);
|
||||
let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: i32x2 = i32x2::new(31, 176);
|
||||
let r: i32x2 = transmute(vdot_s32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdotq_s32() {
|
||||
let a: i32x4 = i32x4::new(1, 2, 1, 2);
|
||||
let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: i32x4 = i32x4::new(31, 176, 31, 176);
|
||||
let r: i32x4 = transmute(vdotq_s32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdot_u32() {
|
||||
let a: u32x2 = u32x2::new(1, 2);
|
||||
let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: u32x2 = u32x2::new(31, 176);
|
||||
let r: u32x2 = transmute(vdot_u32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdotq_u32() {
|
||||
let a: u32x4 = u32x4::new(1, 2, 1, 2);
|
||||
let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: u32x4 = u32x4::new(31, 176, 31, 176);
|
||||
let r: u32x4 = transmute(vdotq_u32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdot_lane_s32() {
|
||||
let a: i32x2 = i32x2::new(1, 2);
|
||||
let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: i32x2 = i32x2::new(31, 72);
|
||||
let r: i32x2 = transmute(vdot_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdot_laneq_s32() {
|
||||
let a: i32x2 = i32x2::new(1, 2);
|
||||
let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let b: i8x8 = i8x8::new(-1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: i32x2 = i32x2::new(31, 72);
|
||||
let e: i32x2 = i32x2::new(29, 72);
|
||||
let r: i32x2 = transmute(vdot_laneq_s32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdotq_lane_s32() {
|
||||
let a: i32x4 = i32x4::new(1, 2, 1, 2);
|
||||
let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: i32x4 = i32x4::new(31, 72, 31, 72);
|
||||
let r: i32x4 = transmute(vdotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdotq_laneq_s32() {
|
||||
let a: i32x4 = i32x4::new(1, 2, 1, 2);
|
||||
let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let b: i8x16 = i8x16::new(-1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: i32x4 = i32x4::new(31, 72, 31, 72);
|
||||
let e: i32x4 = i32x4::new(29, 72, 31, 72);
|
||||
let r: i32x4 = transmute(vdotq_laneq_s32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdot_lane_u32() {
|
||||
let a: u32x2 = u32x2::new(1, 2);
|
||||
let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: u32x2 = u32x2::new(31, 72);
|
||||
let r: u32x2 = transmute(vdot_lane_u32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdot_laneq_u32() {
|
||||
let a: u32x2 = u32x2::new(1, 2);
|
||||
let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let b: u8x8 = u8x8::new(255, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: u32x2 = u32x2::new(31, 72);
|
||||
let e: u32x2 = u32x2::new(285, 72);
|
||||
let r: u32x2 = transmute(vdot_laneq_u32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdotq_lane_u32() {
|
||||
let a: u32x4 = u32x4::new(1, 2, 1, 2);
|
||||
let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: u32x4 = u32x4::new(31, 72, 31, 72);
|
||||
let r: u32x4 = transmute(vdotq_lane_u32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdotq_laneq_u32() {
|
||||
let a: u32x4 = u32x4::new(1, 2, 1, 2);
|
||||
let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let b: u8x16 = u8x16::new(255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: u32x4 = u32x4::new(31, 72, 31, 72);
|
||||
let e: u32x4 = u32x4::new(285, 72, 31, 72);
|
||||
let r: u32x4 = transmute(vdotq_laneq_u32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18837,6 +18837,142 @@ pub unsafe fn vsubl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
|
|||
simd_sub(c, d)
|
||||
}
|
||||
|
||||
/// Dot product arithmetic (vector)
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_s32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
|
||||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot))]
|
||||
pub unsafe fn vdot_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t {
|
||||
#[allow(improper_ctypes)]
|
||||
extern "unadjusted" {
|
||||
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sdot.v2i32.v8i8")]
|
||||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v2i32.v8i8")]
|
||||
fn vdot_s32_(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t;
|
||||
}
|
||||
vdot_s32_(a, b, c)
|
||||
}
|
||||
|
||||
/// Dot product arithmetic (vector)
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_s32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
|
||||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot))]
|
||||
pub unsafe fn vdotq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t {
|
||||
#[allow(improper_ctypes)]
|
||||
extern "unadjusted" {
|
||||
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sdot.v4i32.v16i8")]
|
||||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v4i32.v16i8")]
|
||||
fn vdotq_s32_(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t;
|
||||
}
|
||||
vdotq_s32_(a, b, c)
|
||||
}
|
||||
|
||||
/// Dot product arithmetic (vector)
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_u32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
|
||||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot))]
|
||||
pub unsafe fn vdot_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t {
|
||||
#[allow(improper_ctypes)]
|
||||
extern "unadjusted" {
|
||||
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.udot.v2i32.v8i8")]
|
||||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v2i32.v8i8")]
|
||||
fn vdot_u32_(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t;
|
||||
}
|
||||
vdot_u32_(a, b, c)
|
||||
}
|
||||
|
||||
/// Dot product arithmetic (vector)
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_u32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
|
||||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot))]
|
||||
pub unsafe fn vdotq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t {
|
||||
#[allow(improper_ctypes)]
|
||||
extern "unadjusted" {
|
||||
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.udot.v4i32.v16i8")]
|
||||
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v4i32.v16i8")]
|
||||
fn vdotq_u32_(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t;
|
||||
}
|
||||
vdotq_u32_(a, b, c)
|
||||
}
|
||||
|
||||
/// Dot product arithmetic (indexed)
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_s32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
|
||||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot, LANE = 0))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t {
|
||||
static_assert_uimm_bits!(LANE, 1);
|
||||
let c: int32x2_t = transmute(c);
|
||||
let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
|
||||
vdot_s32(a, b, transmute(c))
|
||||
}
|
||||
|
||||
/// Dot product arithmetic (indexed)
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_s32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
|
||||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot, LANE = 0))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x8_t) -> int32x4_t {
|
||||
static_assert_uimm_bits!(LANE, 1);
|
||||
let c: int32x2_t = transmute(c);
|
||||
let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
|
||||
vdotq_s32(a, b, transmute(c))
|
||||
}
|
||||
|
||||
/// Dot product arithmetic (indexed)
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_u32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
|
||||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot, LANE = 0))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t {
|
||||
static_assert_uimm_bits!(LANE, 1);
|
||||
let c: uint32x2_t = transmute(c);
|
||||
let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
|
||||
vdot_u32(a, b, transmute(c))
|
||||
}
|
||||
|
||||
/// Dot product arithmetic (indexed)
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_u32)
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon,dotprod")]
|
||||
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
|
||||
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot, LANE = 0))]
|
||||
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot, LANE = 0))]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x8_t) -> uint32x4_t {
|
||||
static_assert_uimm_bits!(LANE, 1);
|
||||
let c: uint32x2_t = transmute(c);
|
||||
let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
|
||||
vdotq_u32(a, b, transmute(c))
|
||||
}
|
||||
|
||||
/// Maximum (vector)
|
||||
///
|
||||
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmax_s8)
|
||||
|
|
@ -39239,6 +39375,86 @@ mod test {
|
|||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdot_s32() {
|
||||
let a: i32x2 = i32x2::new(1, 2);
|
||||
let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: i32x2 = i32x2::new(31, 176);
|
||||
let r: i32x2 = transmute(vdot_s32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdotq_s32() {
|
||||
let a: i32x4 = i32x4::new(1, 2, 1, 2);
|
||||
let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: i32x4 = i32x4::new(31, 176, 31, 176);
|
||||
let r: i32x4 = transmute(vdotq_s32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdot_u32() {
|
||||
let a: u32x2 = u32x2::new(1, 2);
|
||||
let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: u32x2 = u32x2::new(31, 176);
|
||||
let r: u32x2 = transmute(vdot_u32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdotq_u32() {
|
||||
let a: u32x4 = u32x4::new(1, 2, 1, 2);
|
||||
let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: u32x4 = u32x4::new(31, 176, 31, 176);
|
||||
let r: u32x4 = transmute(vdotq_u32(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdot_lane_s32() {
|
||||
let a: i32x2 = i32x2::new(1, 2);
|
||||
let b: i8x8 = i8x8::new(-1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: i32x2 = i32x2::new(29, 72);
|
||||
let r: i32x2 = transmute(vdot_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdotq_lane_s32() {
|
||||
let a: i32x4 = i32x4::new(1, 2, 1, 2);
|
||||
let b: i8x16 = i8x16::new(-1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: i32x4 = i32x4::new(29, 72, 31, 72);
|
||||
let r: i32x4 = transmute(vdotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdot_lane_u32() {
|
||||
let a: u32x2 = u32x2::new(1, 2);
|
||||
let b: u8x8 = u8x8::new(255, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: u32x2 = u32x2::new(285, 72);
|
||||
let r: u32x2 = transmute(vdot_lane_u32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon,dotprod")]
|
||||
unsafe fn test_vdotq_lane_u32() {
|
||||
let a: u32x4 = u32x4::new(1, 2, 1, 2);
|
||||
let b: u8x16 = u8x16::new(255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let e: u32x4 = u32x4::new(285, 72, 31, 72);
|
||||
let r: u32x4 = transmute(vdotq_lane_u32::<0>(transmute(a), transmute(b), transmute(c)));
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "neon")]
|
||||
unsafe fn test_vmax_s8() {
|
||||
let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
|
|
|
|||
|
|
@ -160,14 +160,6 @@ vcvtpq_s32_f32
|
|||
vcvtpq_u32_f32
|
||||
vcvtp_s32_f32
|
||||
vcvtp_u32_f32
|
||||
vdot_lane_s32
|
||||
vdot_lane_u32
|
||||
vdotq_lane_s32
|
||||
vdotq_lane_u32
|
||||
vdotq_s32
|
||||
vdotq_u32
|
||||
vdot_s32
|
||||
vdot_u32
|
||||
vqdmulh_lane_s16
|
||||
vqdmulh_lane_s32
|
||||
vqdmulhq_lane_s16
|
||||
|
|
|
|||
|
|
@ -22,5 +22,7 @@ features! {
|
|||
@FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] sha2: "sha2";
|
||||
/// FEAT_SHA1 & FEAT_SHA256 (SHA1 & SHA2-256 instructions)
|
||||
@FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] i8mm: "i8mm";
|
||||
/// FEAT_I8MM
|
||||
/// FEAT_I8MM (integer matrix multiplication, plus ASIMD support)
|
||||
@FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] dotprod: "dotprod";
|
||||
/// FEAT_DotProd (Vector Dot-Product - ASIMDDP)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,6 +17,8 @@ pub(crate) fn detect_features() -> cache::Initializer {
|
|||
//
|
||||
// [hwcap]: https://github.com/torvalds/linux/blob/master/arch/arm/include/uapi/asm/hwcap.h
|
||||
if let Ok(auxv) = auxvec::auxv() {
|
||||
enable_feature(&mut value, Feature::i8mm, bit::test(auxv.hwcap, 27));
|
||||
enable_feature(&mut value, Feature::dotprod, bit::test(auxv.hwcap, 24));
|
||||
enable_feature(&mut value, Feature::neon, bit::test(auxv.hwcap, 12));
|
||||
enable_feature(&mut value, Feature::pmull, bit::test(auxv.hwcap2, 1));
|
||||
enable_feature(&mut value, Feature::crc, bit::test(auxv.hwcap2, 4));
|
||||
|
|
@ -37,6 +39,12 @@ pub(crate) fn detect_features() -> cache::Initializer {
|
|||
Feature::neon,
|
||||
c.field("Features").has("neon") && !has_broken_neon(&c),
|
||||
);
|
||||
enable_feature(&mut value, Feature::i8mm, c.field("Features").has("i8mm"));
|
||||
enable_feature(
|
||||
&mut value,
|
||||
Feature::dotprod,
|
||||
c.field("Features").has("asimddp"),
|
||||
);
|
||||
enable_feature(&mut value, Feature::pmull, c.field("Features").has("pmull"));
|
||||
enable_feature(&mut value, Feature::crc, c.field("Features").has("crc32"));
|
||||
enable_feature(&mut value, Feature::aes, c.field("Features").has("aes"));
|
||||
|
|
|
|||
|
|
@ -20,11 +20,8 @@ fn all() {
|
|||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(all(
|
||||
target_arch = "arm",
|
||||
any(target_os = "linux", target_os = "android", target_os = "freebsd"),
|
||||
))]
|
||||
fn arm_linux_or_freebsd() {
|
||||
#[cfg(all(target_arch = "arm", any(target_os = "freebsd"),))]
|
||||
fn arm_freebsd() {
|
||||
println!("neon: {}", is_arm_feature_detected!("neon"));
|
||||
println!("pmull: {}", is_arm_feature_detected!("pmull"));
|
||||
println!("crc: {}", is_arm_feature_detected!("crc"));
|
||||
|
|
@ -32,6 +29,18 @@ fn arm_linux_or_freebsd() {
|
|||
println!("sha2: {}", is_arm_feature_detected!("sha2"));
|
||||
}
|
||||
|
||||
/// Prints the runtime detection result for each Arm feature listed below,
/// one `name: bool` line per feature.
#[test]
#[cfg(all(
    target_arch = "arm",
    any(target_os = "linux", target_os = "android"),
))]
fn arm_linux() {
    // `tt` (not `literal`) so the string token can still be matched by
    // `is_arm_feature_detected!` after being forwarded.
    macro_rules! report {
        ($($feature:tt),* $(,)?) => {
            $(println!(concat!($feature, ": {}"), is_arm_feature_detected!($feature));)*
        };
    }
    report!("neon", "pmull", "crc", "aes", "sha2", "dotprod", "i8mm");
}
|
||||
|
||||
#[test]
|
||||
#[cfg(all(
|
||||
target_arch = "aarch64",
|
||||
|
|
|
|||
|
|
@ -4723,7 +4723,7 @@ aarch64 = fcmla
|
|||
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t
|
||||
generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
|
||||
|
||||
/// Dot product arithmetic
|
||||
/// Dot product arithmetic (vector)
|
||||
name = vdot
|
||||
out-suffix
|
||||
a = 1, 2, 1, 2
|
||||
|
|
@ -4732,35 +4732,65 @@ c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
|
|||
validate 31, 176, 31, 176
|
||||
target = dotprod
|
||||
|
||||
arm = vsdot
|
||||
aarch64 = sdot
|
||||
link-arm = sdot._EXT_._EXT3_
|
||||
link-aarch64 = sdot._EXT_._EXT3_
|
||||
generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t
|
||||
|
||||
arm = vudot
|
||||
aarch64 = udot
|
||||
link-arm = udot._EXT_._EXT3_
|
||||
link-aarch64 = udot._EXT_._EXT3_
|
||||
generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t
|
||||
|
||||
/// Dot product arithmetic
|
||||
/// Dot product arithmetic (indexed)
|
||||
name = vdot
|
||||
out-lane-suffixes
|
||||
constn = LANE
|
||||
multi_fn = static_assert_imm-in2_dot-LANE
|
||||
multi_fn = simd_shuffle!, c:in_t, c, c, {base-4-LANE}
|
||||
multi_fn = vdot-out-noext, a, b, c
|
||||
multi_fn = transmute, c:merge4_t2, c
|
||||
multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32}
|
||||
multi_fn = vdot-out-noext, a, b, {transmute, c}
|
||||
a = 1, 2, 1, 2
|
||||
b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
|
||||
b = -1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
|
||||
c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
|
||||
n = 0
|
||||
validate 31, 72, 31, 72
|
||||
validate 29, 72, 31, 72
|
||||
target = dotprod
|
||||
|
||||
// Only AArch64 has the laneq forms.
|
||||
aarch64 = sdot
|
||||
generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x2_t:int8x8_t:int8x16_t:int32x2_t
|
||||
generate int32x4_t:int8x16_t:int8x8_t:int32x4_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t
|
||||
generate int32x2_t:int8x8_t:int8x16_t:int32x2_t
|
||||
generate int32x4_t:int8x16_t:int8x16_t:int32x4_t
|
||||
|
||||
arm = vsdot
|
||||
generate int32x2_t:int8x8_t:int8x8_t:int32x2_t
|
||||
generate int32x4_t:int8x16_t:int8x8_t:int32x4_t
|
||||
|
||||
/// Dot product arithmetic (indexed)
|
||||
name = vdot
|
||||
out-lane-suffixes
|
||||
constn = LANE
|
||||
multi_fn = static_assert_imm-in2_dot-LANE
|
||||
multi_fn = transmute, c:merge4_t2, c
|
||||
multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32}
|
||||
multi_fn = vdot-out-noext, a, b, {transmute, c}
|
||||
a = 1, 2, 1, 2
|
||||
b = 255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
|
||||
c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
|
||||
n = 0
|
||||
validate 285, 72, 31, 72
|
||||
target = dotprod
|
||||
|
||||
// Only AArch64 has the laneq forms.
|
||||
aarch64 = udot
|
||||
generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t
|
||||
generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t
|
||||
generate uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t
|
||||
generate uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t
|
||||
|
||||
arm = vudot
|
||||
generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t
|
||||
generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t
|
||||
|
||||
/// Maximum (vector)
|
||||
name = vmax
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue