Support AArch32 Neon dotprod intrinsics.

Note that runtime feature detection requires a recent Linux kernel (v6.2 or later).
This commit is contained in:
Jacob Bramley 2023-05-31 15:08:51 +01:00 committed by Amanieu d'Antras
parent 1e15fa3f0a
commit a9fecd8456
7 changed files with 305 additions and 236 deletions

View file

@ -10557,80 +10557,7 @@ pub unsafe fn vcmlaq_rot270_laneq_f32<const LANE: i32>(a: float32x4_t, b: float3
vcmlaq_rot270_f32(a, b, c)
}
/// Dot product arithmetic
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_s32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(test, assert_instr(sdot))]
pub unsafe fn vdot_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t {
#[allow(improper_ctypes)]
extern "unadjusted" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v2i32.v8i8")]
fn vdot_s32_(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t;
}
vdot_s32_(a, b, c)
}
/// Dot product arithmetic
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_s32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(test, assert_instr(sdot))]
pub unsafe fn vdotq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t {
#[allow(improper_ctypes)]
extern "unadjusted" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v4i32.v16i8")]
fn vdotq_s32_(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t;
}
vdotq_s32_(a, b, c)
}
/// Dot product arithmetic
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_u32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(test, assert_instr(udot))]
pub unsafe fn vdot_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t {
#[allow(improper_ctypes)]
extern "unadjusted" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v2i32.v8i8")]
fn vdot_u32_(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t;
}
vdot_u32_(a, b, c)
}
/// Dot product arithmetic
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_u32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(test, assert_instr(udot))]
pub unsafe fn vdotq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t {
#[allow(improper_ctypes)]
extern "unadjusted" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v4i32.v16i8")]
fn vdotq_u32_(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t;
}
vdotq_u32_(a, b, c)
}
/// Dot product arithmetic
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_s32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(test, assert_instr(sdot, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t {
static_assert_uimm_bits!(LANE, 1);
let c: int8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
vdot_s32(a, b, c)
}
/// Dot product arithmetic
/// Dot product arithmetic (indexed)
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_laneq_s32)
#[inline]
@ -10639,24 +10566,12 @@ pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x
#[rustc_legacy_const_generics(3)]
pub unsafe fn vdot_laneq_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x16_t) -> int32x2_t {
static_assert_uimm_bits!(LANE, 2);
let c: int8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
vdot_s32(a, b, c)
let c: int32x4_t = transmute(c);
let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
vdot_s32(a, b, transmute(c))
}
/// Dot product arithmetic
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_s32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(test, assert_instr(sdot, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x8_t) -> int32x4_t {
static_assert_uimm_bits!(LANE, 1);
let c: int8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
vdotq_s32(a, b, c)
}
/// Dot product arithmetic
/// Dot product arithmetic (indexed)
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_laneq_s32)
#[inline]
@ -10665,24 +10580,12 @@ pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int
#[rustc_legacy_const_generics(3)]
pub unsafe fn vdotq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t {
static_assert_uimm_bits!(LANE, 2);
let c: int8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
vdotq_s32(a, b, c)
let c: int32x4_t = transmute(c);
let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
vdotq_s32(a, b, transmute(c))
}
/// Dot product arithmetic
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_u32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(test, assert_instr(udot, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t {
static_assert_uimm_bits!(LANE, 1);
let c: uint8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
vdot_u32(a, b, c)
}
/// Dot product arithmetic
/// Dot product arithmetic (indexed)
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_laneq_u32)
#[inline]
@ -10691,24 +10594,12 @@ pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uin
#[rustc_legacy_const_generics(3)]
pub unsafe fn vdot_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x16_t) -> uint32x2_t {
static_assert_uimm_bits!(LANE, 2);
let c: uint8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
vdot_u32(a, b, c)
let c: uint32x4_t = transmute(c);
let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
vdot_u32(a, b, transmute(c))
}
/// Dot product arithmetic
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_u32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(test, assert_instr(udot, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x8_t) -> uint32x4_t {
static_assert_uimm_bits!(LANE, 1);
let c: uint8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
vdotq_u32(a, b, c)
}
/// Dot product arithmetic
/// Dot product arithmetic (indexed)
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_laneq_u32)
#[inline]
@ -10717,8 +10608,9 @@ pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: u
#[rustc_legacy_const_generics(3)]
pub unsafe fn vdotq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t {
static_assert_uimm_bits!(LANE, 2);
let c: uint8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
vdotq_u32(a, b, c)
let c: uint32x4_t = transmute(c);
let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
vdotq_u32(a, b, transmute(c))
}
/// Maximum (vector)
@ -23759,122 +23651,42 @@ mod test {
assert_eq!(r, e);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdot_s32() {
let a: i32x2 = i32x2::new(1, 2);
let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let e: i32x2 = i32x2::new(31, 176);
let r: i32x2 = transmute(vdot_s32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdotq_s32() {
let a: i32x4 = i32x4::new(1, 2, 1, 2);
let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let e: i32x4 = i32x4::new(31, 176, 31, 176);
let r: i32x4 = transmute(vdotq_s32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdot_u32() {
let a: u32x2 = u32x2::new(1, 2);
let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let e: u32x2 = u32x2::new(31, 176);
let r: u32x2 = transmute(vdot_u32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdotq_u32() {
let a: u32x4 = u32x4::new(1, 2, 1, 2);
let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let e: u32x4 = u32x4::new(31, 176, 31, 176);
let r: u32x4 = transmute(vdotq_u32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdot_lane_s32() {
let a: i32x2 = i32x2::new(1, 2);
let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let e: i32x2 = i32x2::new(31, 72);
let r: i32x2 = transmute(vdot_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdot_laneq_s32() {
let a: i32x2 = i32x2::new(1, 2);
let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let b: i8x8 = i8x8::new(-1, 2, 3, 4, 5, 6, 7, 8);
let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let e: i32x2 = i32x2::new(31, 72);
let e: i32x2 = i32x2::new(29, 72);
let r: i32x2 = transmute(vdot_laneq_s32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdotq_lane_s32() {
let a: i32x4 = i32x4::new(1, 2, 1, 2);
let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let e: i32x4 = i32x4::new(31, 72, 31, 72);
let r: i32x4 = transmute(vdotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdotq_laneq_s32() {
let a: i32x4 = i32x4::new(1, 2, 1, 2);
let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let b: i8x16 = i8x16::new(-1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let e: i32x4 = i32x4::new(31, 72, 31, 72);
let e: i32x4 = i32x4::new(29, 72, 31, 72);
let r: i32x4 = transmute(vdotq_laneq_s32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdot_lane_u32() {
let a: u32x2 = u32x2::new(1, 2);
let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let e: u32x2 = u32x2::new(31, 72);
let r: u32x2 = transmute(vdot_lane_u32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdot_laneq_u32() {
let a: u32x2 = u32x2::new(1, 2);
let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let b: u8x8 = u8x8::new(255, 2, 3, 4, 5, 6, 7, 8);
let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let e: u32x2 = u32x2::new(31, 72);
let e: u32x2 = u32x2::new(285, 72);
let r: u32x2 = transmute(vdot_laneq_u32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdotq_lane_u32() {
let a: u32x4 = u32x4::new(1, 2, 1, 2);
let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let e: u32x4 = u32x4::new(31, 72, 31, 72);
let r: u32x4 = transmute(vdotq_lane_u32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdotq_laneq_u32() {
let a: u32x4 = u32x4::new(1, 2, 1, 2);
let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let b: u8x16 = u8x16::new(255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
let e: u32x4 = u32x4::new(31, 72, 31, 72);
let e: u32x4 = u32x4::new(285, 72, 31, 72);
let r: u32x4 = transmute(vdotq_laneq_u32::<0>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}

View file

@ -18837,6 +18837,142 @@ pub unsafe fn vsubl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
simd_sub(c, d)
}
/// Dot product arithmetic (vector)
///
/// Forwards `a`, `b` and `c` unchanged to the signed dot-product LLVM
/// intrinsic declared in the body and returns its result.
///
/// # Safety
///
/// This function is compiled with the `neon` and `dotprod` target features
/// enabled (plus `v8` when `target_arch = "arm"`), so the caller must ensure
/// the running CPU supports them.
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_s32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
// In test builds `assert_instr` names the expected instruction: `vsdot` on arm, `sdot` on aarch64.
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot))]
pub unsafe fn vdot_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t {
// NOTE(review): presumably required because the SIMD vector types trip the FFI-safety lint — confirm.
#[allow(improper_ctypes)]
extern "unadjusted" {
// The architecture-specific `link_name` picks the LLVM intrinsic this declaration resolves to.
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sdot.v2i32.v8i8")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v2i32.v8i8")]
fn vdot_s32_(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t;
}
vdot_s32_(a, b, c)
}
/// Dot product arithmetic (vector)
///
/// 128-bit counterpart of the signed dot product: forwards `a`, `b` and `c`
/// unchanged to the LLVM intrinsic declared in the body and returns its
/// result.
///
/// # Safety
///
/// This function is compiled with the `neon` and `dotprod` target features
/// enabled (plus `v8` when `target_arch = "arm"`), so the caller must ensure
/// the running CPU supports them.
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_s32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
// In test builds `assert_instr` names the expected instruction: `vsdot` on arm, `sdot` on aarch64.
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot))]
pub unsafe fn vdotq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t {
// NOTE(review): presumably required because the SIMD vector types trip the FFI-safety lint — confirm.
#[allow(improper_ctypes)]
extern "unadjusted" {
// The architecture-specific `link_name` picks the LLVM intrinsic this declaration resolves to.
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sdot.v4i32.v16i8")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v4i32.v16i8")]
fn vdotq_s32_(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t;
}
vdotq_s32_(a, b, c)
}
/// Dot product arithmetic (vector)
///
/// Forwards `a`, `b` and `c` unchanged to the unsigned dot-product LLVM
/// intrinsic declared in the body and returns its result.
///
/// # Safety
///
/// This function is compiled with the `neon` and `dotprod` target features
/// enabled (plus `v8` when `target_arch = "arm"`), so the caller must ensure
/// the running CPU supports them.
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_u32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
// In test builds `assert_instr` names the expected instruction: `vudot` on arm, `udot` on aarch64.
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot))]
pub unsafe fn vdot_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t {
// NOTE(review): presumably required because the SIMD vector types trip the FFI-safety lint — confirm.
#[allow(improper_ctypes)]
extern "unadjusted" {
// The architecture-specific `link_name` picks the LLVM intrinsic this declaration resolves to.
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.udot.v2i32.v8i8")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v2i32.v8i8")]
fn vdot_u32_(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t;
}
vdot_u32_(a, b, c)
}
/// Dot product arithmetic (vector)
///
/// 128-bit counterpart of the unsigned dot product: forwards `a`, `b` and
/// `c` unchanged to the LLVM intrinsic declared in the body and returns its
/// result.
///
/// # Safety
///
/// This function is compiled with the `neon` and `dotprod` target features
/// enabled (plus `v8` when `target_arch = "arm"`), so the caller must ensure
/// the running CPU supports them.
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_u32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
// In test builds `assert_instr` names the expected instruction: `vudot` on arm, `udot` on aarch64.
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot))]
pub unsafe fn vdotq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t {
// NOTE(review): presumably required because the SIMD vector types trip the FFI-safety lint — confirm.
#[allow(improper_ctypes)]
extern "unadjusted" {
// The architecture-specific `link_name` picks the LLVM intrinsic this declaration resolves to.
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.udot.v4i32.v16i8")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v4i32.v16i8")]
fn vdotq_u32_(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t;
}
vdotq_u32_(a, b, c)
}
/// Dot product arithmetic (indexed)
///
/// Reinterprets `c` as two 32-bit words, broadcasts the word selected by
/// `LANE` to both lanes, and passes the result (viewed as bytes again) to
/// [`vdot_s32`] together with `a` and `b`.
///
/// `LANE` is validated by `static_assert_uimm_bits!(LANE, 1)`.
///
/// # Safety
///
/// This function is compiled with the `neon` and `dotprod` target features
/// enabled (plus `v8` when `target_arch = "arm"`), so the caller must ensure
/// the running CPU supports them.
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_s32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot, LANE = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t {
    static_assert_uimm_bits!(LANE, 1);
    // One 32-bit word covers one group of four 8-bit elements.
    let words: int32x2_t = transmute(c);
    // Replicate the chosen word into every output position.
    let splat: int32x2_t = simd_shuffle!(words, words, [LANE as u32, LANE as u32]);
    vdot_s32(a, b, transmute(splat))
}
/// Dot product arithmetic (indexed)
///
/// Reinterprets the 64-bit vector `c` as two 32-bit words, broadcasts the
/// word selected by `LANE` to all four lanes of a 128-bit vector, and passes
/// the result (viewed as bytes again) to [`vdotq_s32`] together with `a` and
/// `b`.
///
/// `LANE` is validated by `static_assert_uimm_bits!(LANE, 1)`.
///
/// # Safety
///
/// This function is compiled with the `neon` and `dotprod` target features
/// enabled (plus `v8` when `target_arch = "arm"`), so the caller must ensure
/// the running CPU supports them.
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_s32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot, LANE = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x8_t) -> int32x4_t {
    static_assert_uimm_bits!(LANE, 1);
    // One 32-bit word covers one group of four 8-bit elements.
    let words: int32x2_t = transmute(c);
    // Widen to four lanes while replicating the chosen word.
    let splat: int32x4_t = simd_shuffle!(words, words, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
    vdotq_s32(a, b, transmute(splat))
}
/// Dot product arithmetic (indexed)
///
/// Reinterprets `c` as two 32-bit words, broadcasts the word selected by
/// `LANE` to both lanes, and passes the result (viewed as bytes again) to
/// [`vdot_u32`] together with `a` and `b`.
///
/// `LANE` is validated by `static_assert_uimm_bits!(LANE, 1)`.
///
/// # Safety
///
/// This function is compiled with the `neon` and `dotprod` target features
/// enabled (plus `v8` when `target_arch = "arm"`), so the caller must ensure
/// the running CPU supports them.
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_u32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot, LANE = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t {
    static_assert_uimm_bits!(LANE, 1);
    // One 32-bit word covers one group of four 8-bit elements.
    let words: uint32x2_t = transmute(c);
    // Replicate the chosen word into every output position.
    let splat: uint32x2_t = simd_shuffle!(words, words, [LANE as u32, LANE as u32]);
    vdot_u32(a, b, transmute(splat))
}
/// Dot product arithmetic (indexed)
///
/// Reinterprets the 64-bit vector `c` as two 32-bit words, broadcasts the
/// word selected by `LANE` to all four lanes of a 128-bit vector, and passes
/// the result (viewed as bytes again) to [`vdotq_u32`] together with `a` and
/// `b`.
///
/// `LANE` is validated by `static_assert_uimm_bits!(LANE, 1)`.
///
/// # Safety
///
/// This function is compiled with the `neon` and `dotprod` target features
/// enabled (plus `v8` when `target_arch = "arm"`), so the caller must ensure
/// the running CPU supports them.
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_u32)
#[inline]
#[target_feature(enable = "neon,dotprod")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot, LANE = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot, LANE = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x8_t) -> uint32x4_t {
    static_assert_uimm_bits!(LANE, 1);
    // One 32-bit word covers one group of four 8-bit elements.
    let words: uint32x2_t = transmute(c);
    // Widen to four lanes while replicating the chosen word.
    let splat: uint32x4_t = simd_shuffle!(words, words, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
    vdotq_u32(a, b, transmute(splat))
}
/// Maximum (vector)
///
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmax_s8)
@ -39239,6 +39375,86 @@ mod test {
assert_eq!(r, e);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdot_s32() {
    let acc: i32x2 = i32x2::new(1, 2);
    let lhs: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
    let rhs: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
    // lane 0: 1 + (1*1 + 2*2 + 3*3 + 4*4) = 31
    // lane 1: 2 + (5*5 + 6*6 + 7*7 + 8*8) = 176
    let expected: i32x2 = i32x2::new(31, 176);
    let actual: i32x2 = transmute(vdot_s32(transmute(acc), transmute(lhs), transmute(rhs)));
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdotq_s32() {
    let acc: i32x4 = i32x4::new(1, 2, 1, 2);
    let lhs: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
    let rhs: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
    // lanes 0 and 2: 1 + (1*1 + 2*2 + 3*3 + 4*4) = 31
    // lanes 1 and 3: 2 + (5*5 + 6*6 + 7*7 + 8*8) = 176
    let expected: i32x4 = i32x4::new(31, 176, 31, 176);
    let actual: i32x4 = transmute(vdotq_s32(transmute(acc), transmute(lhs), transmute(rhs)));
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdot_u32() {
    let acc: u32x2 = u32x2::new(1, 2);
    let lhs: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
    let rhs: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
    // lane 0: 1 + (1*1 + 2*2 + 3*3 + 4*4) = 31
    // lane 1: 2 + (5*5 + 6*6 + 7*7 + 8*8) = 176
    let expected: u32x2 = u32x2::new(31, 176);
    let actual: u32x2 = transmute(vdot_u32(transmute(acc), transmute(lhs), transmute(rhs)));
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdotq_u32() {
    let acc: u32x4 = u32x4::new(1, 2, 1, 2);
    let lhs: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
    let rhs: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
    // lanes 0 and 2: 1 + (1*1 + 2*2 + 3*3 + 4*4) = 31
    // lanes 1 and 3: 2 + (5*5 + 6*6 + 7*7 + 8*8) = 176
    let expected: u32x4 = u32x4::new(31, 176, 31, 176);
    let actual: u32x4 = transmute(vdotq_u32(transmute(acc), transmute(lhs), transmute(rhs)));
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdot_lane_s32() {
    let acc: i32x2 = i32x2::new(1, 2);
    // NOTE(review): the leading -1 presumably exercises signed handling — confirm intent.
    let lhs: i8x8 = i8x8::new(-1, 2, 3, 4, 5, 6, 7, 8);
    let rhs: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
    // LANE = 0 selects (1, 2, 3, 4) from `rhs` for every output lane:
    // lane 0: 1 + (-1*1 + 2*2 + 3*3 + 4*4) = 29
    // lane 1: 2 + (5*1 + 6*2 + 7*3 + 8*4) = 72
    let expected: i32x2 = i32x2::new(29, 72);
    let actual: i32x2 = transmute(vdot_lane_s32::<0>(transmute(acc), transmute(lhs), transmute(rhs)));
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdotq_lane_s32() {
    let acc: i32x4 = i32x4::new(1, 2, 1, 2);
    // NOTE(review): the leading -1 presumably exercises signed handling — confirm intent.
    let lhs: i8x16 = i8x16::new(-1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
    let rhs: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
    // LANE = 0 selects (1, 2, 3, 4) from `rhs` for every output lane:
    // lane 0: 1 + (-1*1 + 2*2 + 3*3 + 4*4) = 29
    // lane 1: 2 + (5*1 + 6*2 + 7*3 + 8*4) = 72
    // lane 2: 1 + (1*1 + 2*2 + 3*3 + 4*4) = 31
    // lane 3: 2 + (5*1 + 6*2 + 7*3 + 8*4) = 72
    let expected: i32x4 = i32x4::new(29, 72, 31, 72);
    let actual: i32x4 = transmute(vdotq_lane_s32::<0>(transmute(acc), transmute(lhs), transmute(rhs)));
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdot_lane_u32() {
    let acc: u32x2 = u32x2::new(1, 2);
    // NOTE(review): the leading 255 presumably exercises unsigned handling — confirm intent.
    let lhs: u8x8 = u8x8::new(255, 2, 3, 4, 5, 6, 7, 8);
    let rhs: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
    // LANE = 0 selects (1, 2, 3, 4) from `rhs` for every output lane:
    // lane 0: 1 + (255*1 + 2*2 + 3*3 + 4*4) = 285
    // lane 1: 2 + (5*1 + 6*2 + 7*3 + 8*4) = 72
    let expected: u32x2 = u32x2::new(285, 72);
    let actual: u32x2 = transmute(vdot_lane_u32::<0>(transmute(acc), transmute(lhs), transmute(rhs)));
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon,dotprod")]
unsafe fn test_vdotq_lane_u32() {
    let acc: u32x4 = u32x4::new(1, 2, 1, 2);
    // NOTE(review): the leading 255 presumably exercises unsigned handling — confirm intent.
    let lhs: u8x16 = u8x16::new(255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
    let rhs: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
    // LANE = 0 selects (1, 2, 3, 4) from `rhs` for every output lane:
    // lane 0: 1 + (255*1 + 2*2 + 3*3 + 4*4) = 285
    // lane 1: 2 + (5*1 + 6*2 + 7*3 + 8*4) = 72
    // lane 2: 1 + (1*1 + 2*2 + 3*3 + 4*4) = 31
    // lane 3: 2 + (5*1 + 6*2 + 7*3 + 8*4) = 72
    let expected: u32x4 = u32x4::new(285, 72, 31, 72);
    let actual: u32x4 = transmute(vdotq_lane_u32::<0>(transmute(acc), transmute(lhs), transmute(rhs)));
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmax_s8() {
let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);

View file

@ -160,14 +160,6 @@ vcvtpq_s32_f32
vcvtpq_u32_f32
vcvtp_s32_f32
vcvtp_u32_f32
vdot_lane_s32
vdot_lane_u32
vdotq_lane_s32
vdotq_lane_u32
vdotq_s32
vdotq_u32
vdot_s32
vdot_u32
vqdmulh_lane_s16
vqdmulh_lane_s32
vqdmulhq_lane_s16

View file

@ -22,5 +22,7 @@ features! {
@FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] sha2: "sha2";
/// FEAT_SHA1 & FEAT_SHA256 (SHA1 & SHA2-256 instructions)
@FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] i8mm: "i8mm";
/// FEAT_I8MM
/// FEAT_I8MM (integer matrix multiplication, plus ASIMD support)
@FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] dotprod: "dotprod";
/// FEAT_DotProd (Vector Dot-Product - ASIMDDP)
}

View file

@ -17,6 +17,8 @@ pub(crate) fn detect_features() -> cache::Initializer {
//
// [hwcap]: https://github.com/torvalds/linux/blob/master/arch/arm/include/uapi/asm/hwcap.h
if let Ok(auxv) = auxvec::auxv() {
enable_feature(&mut value, Feature::i8mm, bit::test(auxv.hwcap, 27));
enable_feature(&mut value, Feature::dotprod, bit::test(auxv.hwcap, 24));
enable_feature(&mut value, Feature::neon, bit::test(auxv.hwcap, 12));
enable_feature(&mut value, Feature::pmull, bit::test(auxv.hwcap2, 1));
enable_feature(&mut value, Feature::crc, bit::test(auxv.hwcap2, 4));
@ -37,6 +39,12 @@ pub(crate) fn detect_features() -> cache::Initializer {
Feature::neon,
c.field("Features").has("neon") && !has_broken_neon(&c),
);
enable_feature(&mut value, Feature::i8mm, c.field("Features").has("i8mm"));
enable_feature(
&mut value,
Feature::dotprod,
c.field("Features").has("asimddp"),
);
enable_feature(&mut value, Feature::pmull, c.field("Features").has("pmull"));
enable_feature(&mut value, Feature::crc, c.field("Features").has("crc32"));
enable_feature(&mut value, Feature::aes, c.field("Features").has("aes"));

View file

@ -20,11 +20,8 @@ fn all() {
}
#[test]
#[cfg(all(
target_arch = "arm",
any(target_os = "linux", target_os = "android", target_os = "freebsd"),
))]
fn arm_linux_or_freebsd() {
#[cfg(all(target_arch = "arm", any(target_os = "freebsd"),))]
fn arm_freebsd() {
println!("neon: {}", is_arm_feature_detected!("neon"));
println!("pmull: {}", is_arm_feature_detected!("pmull"));
println!("crc: {}", is_arm_feature_detected!("crc"));
@ -32,6 +29,18 @@ fn arm_linux_or_freebsd() {
println!("sha2: {}", is_arm_feature_detected!("sha2"));
}
// Prints the result of `is_arm_feature_detected!` for each listed feature.
// Nothing is asserted, so the test only verifies that every query compiles
// and runs on 32-bit Arm Linux/Android targets.
#[test]
#[cfg(all(target_arch = "arm", any(target_os = "linux", target_os = "android"),))]
fn arm_linux() {
println!("neon: {}", is_arm_feature_detected!("neon"));
println!("pmull: {}", is_arm_feature_detected!("pmull"));
println!("crc: {}", is_arm_feature_detected!("crc"));
println!("aes: {}", is_arm_feature_detected!("aes"));
println!("sha2: {}", is_arm_feature_detected!("sha2"));
// `dotprod` and `i8mm` are queried here in addition to the features above.
println!("dotprod: {}", is_arm_feature_detected!("dotprod"));
println!("i8mm: {}", is_arm_feature_detected!("i8mm"));
}
#[test]
#[cfg(all(
target_arch = "aarch64",

View file

@ -4723,7 +4723,7 @@ aarch64 = fcmla
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t
generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
/// Dot product arithmetic
/// Dot product arithmetic (vector)
name = vdot
out-suffix
a = 1, 2, 1, 2
@ -4732,35 +4732,65 @@ c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
validate 31, 176, 31, 176
target = dotprod
arm = vsdot
aarch64 = sdot
link-arm = sdot._EXT_._EXT3_
link-aarch64 = sdot._EXT_._EXT3_
generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t
arm = vudot
aarch64 = udot
link-arm = udot._EXT_._EXT3_
link-aarch64 = udot._EXT_._EXT3_
generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t
/// Dot product arithmetic
/// Dot product arithmetic (indexed)
name = vdot
out-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_dot-LANE
multi_fn = simd_shuffle!, c:in_t, c, c, {base-4-LANE}
multi_fn = vdot-out-noext, a, b, c
multi_fn = transmute, c:merge4_t2, c
multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32}
multi_fn = vdot-out-noext, a, b, {transmute, c}
a = 1, 2, 1, 2
b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
b = -1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
n = 0
validate 31, 72, 31, 72
validate 29, 72, 31, 72
target = dotprod
// Only AArch64 has the laneq forms.
aarch64 = sdot
generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x2_t:int8x8_t:int8x16_t:int32x2_t
generate int32x4_t:int8x16_t:int8x8_t:int32x4_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t
generate int32x2_t:int8x8_t:int8x16_t:int32x2_t
generate int32x4_t:int8x16_t:int8x16_t:int32x4_t
arm = vsdot
generate int32x2_t:int8x8_t:int8x8_t:int32x2_t
generate int32x4_t:int8x16_t:int8x8_t:int32x4_t
/// Dot product arithmetic (indexed)
name = vdot
out-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_dot-LANE
multi_fn = transmute, c:merge4_t2, c
multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32}
multi_fn = vdot-out-noext, a, b, {transmute, c}
a = 1, 2, 1, 2
b = 255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
n = 0
validate 285, 72, 31, 72
target = dotprod
// Only AArch64 has the laneq forms.
aarch64 = udot
generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t
generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t
generate uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t
generate uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t
arm = vudot
generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t
generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t
/// Maximum (vector)
name = vmax