Add vmull, vmull_high, vmlal, vmlal_high, vmlsl, vmlsl_high neon instructions (#1091)

This commit is contained in:
Sparrow Li 2021-03-21 06:35:19 +08:00 committed by GitHub
parent ce1027d7d5
commit 63facc4b68
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 1029 additions and 6 deletions

View file

@ -1617,6 +1617,66 @@ pub unsafe fn vmlaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float
simd_add(a, simd_mul(b, c))
}
/// Signed multiply-add long
/// Widens the upper 8 lanes of `b` and `c`, multiplies them, and accumulates into `a` (AArch64 `smlal2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlal2))]
pub unsafe fn vmlal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t {
// Select the high halves, then reuse the non-high widening multiply-add.
let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let c: int8x8_t = simd_shuffle8(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
vmlal_s8(a, b, c)
}
/// Signed multiply-add long
/// Widens the upper 4 lanes of `b` and `c`, multiplies them, and accumulates into `a` (AArch64 `smlal2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlal2))]
pub unsafe fn vmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
// Select the high halves, then reuse the non-high widening multiply-add.
let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
let c: int16x4_t = simd_shuffle4(c, c, [4, 5, 6, 7]);
vmlal_s16(a, b, c)
}
/// Signed multiply-add long
/// Widens the upper 2 lanes of `b` and `c`, multiplies them, and accumulates into `a` (AArch64 `smlal2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlal2))]
pub unsafe fn vmlal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
// Select the high halves, then reuse the non-high widening multiply-add.
let b: int32x2_t = simd_shuffle2(b, b, [2, 3]);
let c: int32x2_t = simd_shuffle2(c, c, [2, 3]);
vmlal_s32(a, b, c)
}
/// Unsigned multiply-add long
/// Widens the upper 8 lanes of `b` and `c`, multiplies them, and accumulates into `a` (AArch64 `umlal2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlal2))]
pub unsafe fn vmlal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t {
// Select the high halves, then reuse the non-high widening multiply-add.
let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let c: uint8x8_t = simd_shuffle8(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
vmlal_u8(a, b, c)
}
/// Unsigned multiply-add long
/// Widens the upper 4 lanes of `b` and `c`, multiplies them, and accumulates into `a` (AArch64 `umlal2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlal2))]
pub unsafe fn vmlal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
// Select the high halves, then reuse the non-high widening multiply-add.
let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
let c: uint16x4_t = simd_shuffle4(c, c, [4, 5, 6, 7]);
vmlal_u16(a, b, c)
}
/// Unsigned multiply-add long
/// Widens the upper 2 lanes of `b` and `c`, multiplies them, and accumulates into `a` (AArch64 `umlal2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlal2))]
pub unsafe fn vmlal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
// Select the high halves, then reuse the non-high widening multiply-add.
let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
let c: uint32x2_t = simd_shuffle2(c, c, [2, 3]);
vmlal_u32(a, b, c)
}
/// Floating-point multiply-subtract from accumulator
#[inline]
#[target_feature(enable = "neon")]
@ -1633,6 +1693,66 @@ pub unsafe fn vmlsq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float
simd_sub(a, simd_mul(b, c))
}
/// Signed multiply-subtract long
/// Widens the upper 8 lanes of `b` and `c`, multiplies them, and subtracts the products from `a` (AArch64 `smlsl2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlsl2))]
pub unsafe fn vmlsl_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t {
// Select the high halves, then reuse the non-high widening multiply-subtract.
let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let c: int8x8_t = simd_shuffle8(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
vmlsl_s8(a, b, c)
}
/// Signed multiply-subtract long
/// Widens the upper 4 lanes of `b` and `c`, multiplies them, and subtracts the products from `a` (AArch64 `smlsl2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlsl2))]
pub unsafe fn vmlsl_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
// Select the high halves, then reuse the non-high widening multiply-subtract.
let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
let c: int16x4_t = simd_shuffle4(c, c, [4, 5, 6, 7]);
vmlsl_s16(a, b, c)
}
/// Signed multiply-subtract long
/// Widens the upper 2 lanes of `b` and `c`, multiplies them, and subtracts the products from `a` (AArch64 `smlsl2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smlsl2))]
pub unsafe fn vmlsl_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
// Select the high halves, then reuse the non-high widening multiply-subtract.
let b: int32x2_t = simd_shuffle2(b, b, [2, 3]);
let c: int32x2_t = simd_shuffle2(c, c, [2, 3]);
vmlsl_s32(a, b, c)
}
/// Unsigned multiply-subtract long
/// Widens the upper 8 lanes of `b` and `c`, multiplies them, and subtracts the products from `a` (AArch64 `umlsl2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlsl2))]
pub unsafe fn vmlsl_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t {
// Select the high halves, then reuse the non-high widening multiply-subtract.
let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let c: uint8x8_t = simd_shuffle8(c, c, [8, 9, 10, 11, 12, 13, 14, 15]);
vmlsl_u8(a, b, c)
}
/// Unsigned multiply-subtract long
/// Widens the upper 4 lanes of `b` and `c`, multiplies them, and subtracts the products from `a` (AArch64 `umlsl2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlsl2))]
pub unsafe fn vmlsl_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
// Select the high halves, then reuse the non-high widening multiply-subtract.
let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
let c: uint16x4_t = simd_shuffle4(c, c, [4, 5, 6, 7]);
vmlsl_u16(a, b, c)
}
/// Unsigned multiply-subtract long
/// Widens the upper 2 lanes of `b` and `c`, multiplies them, and subtracts the products from `a` (AArch64 `umlsl2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umlsl2))]
pub unsafe fn vmlsl_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
// Select the high halves, then reuse the non-high widening multiply-subtract.
let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
let c: uint32x2_t = simd_shuffle2(c, c, [2, 3]);
vmlsl_u32(a, b, c)
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
@ -1649,6 +1769,76 @@ pub unsafe fn vmulq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
simd_mul(a, b)
}
/// Signed multiply long
/// Widening multiply of the upper 8 lanes of `a` and `b` (AArch64 `smull2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smull2))]
pub unsafe fn vmull_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
// Select the high halves, then reuse the non-high widening multiply.
let a: int8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
vmull_s8(a, b)
}
/// Signed multiply long
/// Widening multiply of the upper 4 lanes of `a` and `b` (AArch64 `smull2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smull2))]
pub unsafe fn vmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
// Select the high halves, then reuse the non-high widening multiply.
let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
vmull_s16(a, b)
}
/// Signed multiply long
/// Widening multiply of the upper 2 lanes of `a` and `b` (AArch64 `smull2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smull2))]
pub unsafe fn vmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
// Select the high halves, then reuse the non-high widening multiply.
let a: int32x2_t = simd_shuffle2(a, a, [2, 3]);
let b: int32x2_t = simd_shuffle2(b, b, [2, 3]);
vmull_s32(a, b)
}
/// Unsigned multiply long
/// Widening multiply of the upper 8 lanes of `a` and `b` (AArch64 `umull2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umull2))]
pub unsafe fn vmull_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
// Select the high halves, then reuse the non-high widening multiply.
let a: uint8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
vmull_u8(a, b)
}
/// Unsigned multiply long
/// Widening multiply of the upper 4 lanes of `a` and `b` (AArch64 `umull2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umull2))]
pub unsafe fn vmull_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
// Select the high halves, then reuse the non-high widening multiply.
let a: uint16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
vmull_u16(a, b)
}
/// Unsigned multiply long
/// Widening multiply of the upper 2 lanes of `a` and `b` (AArch64 `umull2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umull2))]
pub unsafe fn vmull_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
// Select the high halves, then reuse the non-high widening multiply.
let a: uint32x2_t = simd_shuffle2(a, a, [2, 3]);
let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
vmull_u32(a, b)
}
/// Polynomial multiply long
/// Polynomial (carry-less) widening multiply of the upper 8 lanes of `a` and `b`.
// NOTE(review): the other `_high` intrinsics assert the `2` form of the
// instruction (smull2/umull2), but this one asserts `pmull` — confirm that
// `pmull` (vs `pmull2`) is the expected codegen here.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(pmull))]
pub unsafe fn vmull_high_p8(a: poly8x16_t, b: poly8x16_t) -> poly16x8_t {
// Select the high halves, then reuse the non-high polynomial multiply.
let a: poly8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: poly8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
vmull_p8(a, b)
}
/// Divide
#[inline]
#[target_feature(enable = "neon")]
@ -3258,6 +3448,66 @@ mod test {
assert_eq!(r, e);
}
// Tests for the widening multiply-accumulate "high" intrinsics. The low
// halves of `b`/`c` are filled with decoy values that must NOT contribute:
// only the upper lanes (c = 0..7 times b = 2, added to a) shape the result.
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_s8() {
let a: i16x8 = i16x8::new(8, 7, 6, 5, 4, 3, 2, 1);
let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
let c: i8x16 = i8x16::new(3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7);
let e: i16x8 = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
let r: i16x8 = transmute(vmlal_high_s8(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_s16() {
// Same pattern as the s8 test, on 4 accumulated lanes.
let a: i32x4 = i32x4::new(8, 7, 6, 5);
let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
let c: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let e: i32x4 = i32x4::new(8, 9, 10, 11);
let r: i32x4 = transmute(vmlal_high_s16(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_s32() {
// Same pattern as the s8 test, on 2 accumulated lanes.
let a: i64x2 = i64x2::new(8, 7);
let b: i32x4 = i32x4::new(2, 2, 2, 2);
let c: i32x4 = i32x4::new(3, 3, 0, 1);
let e: i64x2 = i64x2::new(8, 9);
let r: i64x2 = transmute(vmlal_high_s32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_u8() {
// Unsigned counterpart of test_vmlal_high_s8.
let a: u16x8 = u16x8::new(8, 7, 6, 5, 4, 3, 2, 1);
let b: u8x16 = u8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
let c: u8x16 = u8x16::new(3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7);
let e: u16x8 = u16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
let r: u16x8 = transmute(vmlal_high_u8(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_u16() {
// Unsigned counterpart of test_vmlal_high_s16.
let a: u32x4 = u32x4::new(8, 7, 6, 5);
let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
let c: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let e: u32x4 = u32x4::new(8, 9, 10, 11);
let r: u32x4 = transmute(vmlal_high_u16(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_high_u32() {
// Unsigned counterpart of test_vmlal_high_s32.
let a: u64x2 = u64x2::new(8, 7);
let b: u32x4 = u32x4::new(2, 2, 2, 2);
let c: u32x4 = u32x4::new(3, 3, 0, 1);
let e: u64x2 = u64x2::new(8, 9);
let r: u64x2 = transmute(vmlal_high_u32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmls_f64() {
let a: f64 = 6.;
@ -3278,6 +3528,66 @@ mod test {
assert_eq!(r, e);
}
// Tests for the widening multiply-subtract "high" intrinsics. Decoy values
// occupy the low halves of `b`/`c`; only the upper lanes (c = 0..7 times
// b = 2, subtracted from a) shape the result.
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_s8() {
let a: i16x8 = i16x8::new(14, 15, 16, 17, 18, 19, 20, 21);
let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
let c: i8x16 = i8x16::new(3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7);
let e: i16x8 = i16x8::new(14, 13, 12, 11, 10, 9, 8, 7);
let r: i16x8 = transmute(vmlsl_high_s8(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_s16() {
// Same pattern as the s8 test, on 4 accumulated lanes.
let a: i32x4 = i32x4::new(14, 15, 16, 17);
let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
let c: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let e: i32x4 = i32x4::new(14, 13, 12, 11);
let r: i32x4 = transmute(vmlsl_high_s16(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_s32() {
// Same pattern as the s8 test, on 2 accumulated lanes.
let a: i64x2 = i64x2::new(14, 15);
let b: i32x4 = i32x4::new(2, 2, 2, 2);
let c: i32x4 = i32x4::new(3, 3, 0, 1);
let e: i64x2 = i64x2::new(14, 13);
let r: i64x2 = transmute(vmlsl_high_s32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_u8() {
// Unsigned counterpart of test_vmlsl_high_s8.
let a: u16x8 = u16x8::new(14, 15, 16, 17, 18, 19, 20, 21);
let b: u8x16 = u8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
let c: u8x16 = u8x16::new(3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7);
let e: u16x8 = u16x8::new(14, 13, 12, 11, 10, 9, 8, 7);
let r: u16x8 = transmute(vmlsl_high_u8(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_u16() {
// Unsigned counterpart of test_vmlsl_high_s16.
let a: u32x4 = u32x4::new(14, 15, 16, 17);
let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
let c: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
let e: u32x4 = u32x4::new(14, 13, 12, 11);
let r: u32x4 = transmute(vmlsl_high_u16(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_high_u32() {
// Unsigned counterpart of test_vmlsl_high_s32.
let a: u64x2 = u64x2::new(14, 15);
let b: u32x4 = u32x4::new(2, 2, 2, 2);
let c: u32x4 = u32x4::new(3, 3, 0, 1);
let e: u64x2 = u64x2::new(14, 13);
let r: u64x2 = transmute(vmlsl_high_u32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_f64() {
let a: f64 = 1.0;
@ -3296,6 +3606,69 @@ mod test {
assert_eq!(r, e);
}
// Tests for the widening multiply "high" intrinsics. The low halves of the
// inputs hold decoy values; the expected vector is the lanewise product of
// the upper halves only.
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_s8() {
let a: i8x16 = i8x16::new(1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16);
let b: i8x16 = i8x16::new(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
let e: i16x8 = i16x8::new(9, 20, 11, 24, 13, 28, 15, 32);
let r: i16x8 = transmute(vmull_high_s8(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_s16() {
// Same pattern as the s8 test, on 4 result lanes.
let a: i16x8 = i16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
let b: i16x8 = i16x8::new(1, 2, 1, 2, 1, 2, 1, 2);
let e: i32x4 = i32x4::new(9, 20, 11, 24);
let r: i32x4 = transmute(vmull_high_s16(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_s32() {
// Same pattern as the s8 test, on 2 result lanes.
let a: i32x4 = i32x4::new(1, 2, 9, 10);
let b: i32x4 = i32x4::new(1, 2, 1, 2);
let e: i64x2 = i64x2::new(9, 20);
let r: i64x2 = transmute(vmull_high_s32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_u8() {
// Unsigned counterpart of test_vmull_high_s8.
let a: u8x16 = u8x16::new(1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16);
let b: u8x16 = u8x16::new(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
let e: u16x8 = u16x8::new(9, 20, 11, 24, 13, 28, 15, 32);
let r: u16x8 = transmute(vmull_high_u8(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_u16() {
// Unsigned counterpart of test_vmull_high_s16.
let a: u16x8 = u16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
let b: u16x8 = u16x8::new(1, 2, 1, 2, 1, 2, 1, 2);
let e: u32x4 = u32x4::new(9, 20, 11, 24);
let r: u32x4 = transmute(vmull_high_u16(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_u32() {
// Unsigned counterpart of test_vmull_high_s32.
let a: u32x4 = u32x4::new(1, 2, 9, 10);
let b: u32x4 = u32x4::new(1, 2, 1, 2);
let e: u64x2 = u64x2::new(9, 20);
let r: u64x2 = transmute(vmull_high_u32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_p8() {
// Polynomial (carry-less) multiply of the upper halves; expected values
// are GF(2) carry-less products, not integer products.
let a: i8x16 = i8x16::new(1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16);
let b: i8x16 = i8x16::new(1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3);
let e: i16x8 = i16x8::new(9, 30, 11, 20, 13, 18, 15, 48);
let r: i16x8 = transmute(vmull_high_p8(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vdiv_f32() {
let a: f32x2 = f32x2::new(2.0, 6.0);

View file

@ -2121,6 +2121,66 @@ pub unsafe fn vmlaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float
simd_add(a, simd_mul(b, c))
}
/// Signed multiply-add long
/// Lanewise `a + (b * c)` where the products are computed at double width.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
pub unsafe fn vmlal_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t {
// Widening multiply, then a plain vector add of the accumulator.
simd_add(a, vmull_s8(b, c))
}
/// Signed multiply-add long
/// Lanewise `a + (b * c)` where the products are computed at double width.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
pub unsafe fn vmlal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
simd_add(a, vmull_s16(b, c))
}
/// Signed multiply-add long
/// Lanewise `a + (b * c)` where the products are computed at double width.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
pub unsafe fn vmlal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
simd_add(a, vmull_s32(b, c))
}
/// Unsigned multiply-add long
/// Lanewise `a + (b * c)` where the products are computed at double width.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
pub unsafe fn vmlal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t {
simd_add(a, vmull_u8(b, c))
}
/// Unsigned multiply-add long
/// Lanewise `a + (b * c)` where the products are computed at double width.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
pub unsafe fn vmlal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
simd_add(a, vmull_u16(b, c))
}
/// Unsigned multiply-add long
/// Lanewise `a + (b * c)` where the products are computed at double width.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
pub unsafe fn vmlal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
simd_add(a, vmull_u32(b, c))
}
/// Multiply-subtract from accumulator
#[inline]
#[target_feature(enable = "neon")]
@ -2261,6 +2321,66 @@ pub unsafe fn vmlsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float
simd_sub(a, simd_mul(b, c))
}
/// Signed multiply-subtract long
/// Lanewise `a - (b * c)` where the products are computed at double width.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
pub unsafe fn vmlsl_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t {
// Widening multiply, then a plain vector subtract from the accumulator.
simd_sub(a, vmull_s8(b, c))
}
/// Signed multiply-subtract long
/// Lanewise `a - (b * c)` where the products are computed at double width.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
pub unsafe fn vmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
simd_sub(a, vmull_s16(b, c))
}
/// Signed multiply-subtract long
/// Lanewise `a - (b * c)` where the products are computed at double width.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
pub unsafe fn vmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
simd_sub(a, vmull_s32(b, c))
}
/// Unsigned multiply-subtract long
/// Lanewise `a - (b * c)` where the products are computed at double width.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
pub unsafe fn vmlsl_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t {
simd_sub(a, vmull_u8(b, c))
}
/// Unsigned multiply-subtract long
/// Lanewise `a - (b * c)` where the products are computed at double width.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
pub unsafe fn vmlsl_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
simd_sub(a, vmull_u16(b, c))
}
/// Unsigned multiply-subtract long
/// Lanewise `a - (b * c)` where the products are computed at double width.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
pub unsafe fn vmlsl_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
simd_sub(a, vmull_u32(b, c))
}
/// Saturating subtract
#[inline]
#[target_feature(enable = "neon")]
@ -3297,6 +3417,118 @@ pub unsafe fn vmulq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
simd_mul(a, b)
}
/// Signed multiply long
/// Lanewise widening multiply: each i8 product is returned as an i16, so no overflow occurs.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
pub unsafe fn vmull_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t {
// Dispatch directly to the per-target LLVM intrinsic.
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v8i8")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v8i8")]
fn vmull_s8_(a: int8x8_t, b: int8x8_t) -> int16x8_t;
}
vmull_s8_(a, b)
}
/// Signed multiply long
/// Lanewise widening multiply: each i16 product is returned as an i32.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
pub unsafe fn vmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v4i16")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v4i16")]
fn vmull_s16_(a: int16x4_t, b: int16x4_t) -> int32x4_t;
}
vmull_s16_(a, b)
}
/// Signed multiply long
/// Lanewise widening multiply: each i32 product is returned as an i64.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
pub unsafe fn vmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v2i32")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v2i32")]
fn vmull_s32_(a: int32x2_t, b: int32x2_t) -> int64x2_t;
}
vmull_s32_(a, b)
}
/// Unsigned multiply long
/// Lanewise widening multiply: each u8 product is returned as a u16.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
pub unsafe fn vmull_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v8i8")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v8i8")]
fn vmull_u8_(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t;
}
vmull_u8_(a, b)
}
/// Unsigned multiply long
/// Lanewise widening multiply: each u16 product is returned as a u32.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
pub unsafe fn vmull_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v4i16")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v4i16")]
fn vmull_u16_(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t;
}
vmull_u16_(a, b)
}
/// Unsigned multiply long
/// Lanewise widening multiply: each u32 product is returned as a u64.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
pub unsafe fn vmull_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v2i32")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v2i32")]
fn vmull_u32_(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t;
}
vmull_u32_(a, b)
}
/// Polynomial multiply long
/// Lanewise carry-less (GF(2)) multiply of poly8 lanes, widening to poly16.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.p8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmull))]
pub unsafe fn vmull_p8(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullp.v8i8")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmull.v8i8")]
fn vmull_p8_(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t;
}
vmull_p8_(a, b)
}
/// Subtract
#[inline]
#[target_feature(enable = "neon")]
@ -6105,6 +6337,66 @@ mod test {
assert_eq!(r, e);
}
// Tests for the non-high widening multiply-accumulate intrinsics:
// every lane computes a + 2*3.
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_s8() {
let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3);
let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
let r: i16x8 = transmute(vmlal_s8(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_s16() {
let a: i32x4 = i32x4::new(0, 1, 2, 3);
let b: i16x4 = i16x4::new(2, 2, 2, 2);
let c: i16x4 = i16x4::new(3, 3, 3, 3);
let e: i32x4 = i32x4::new(6, 7, 8, 9);
let r: i32x4 = transmute(vmlal_s16(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_s32() {
let a: i64x2 = i64x2::new(0, 1);
let b: i32x2 = i32x2::new(2, 2);
let c: i32x2 = i32x2::new(3, 3);
let e: i64x2 = i64x2::new(6, 7);
let r: i64x2 = transmute(vmlal_s32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_u8() {
let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3);
let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
let r: u16x8 = transmute(vmlal_u8(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_u16() {
let a: u32x4 = u32x4::new(0, 1, 2, 3);
let b: u16x4 = u16x4::new(2, 2, 2, 2);
let c: u16x4 = u16x4::new(3, 3, 3, 3);
let e: u32x4 = u32x4::new(6, 7, 8, 9);
let r: u32x4 = transmute(vmlal_u16(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlal_u32() {
let a: u64x2 = u64x2::new(0, 1);
let b: u32x2 = u32x2::new(2, 2);
let c: u32x2 = u32x2::new(3, 3);
let e: u64x2 = u64x2::new(6, 7);
let r: u64x2 = transmute(vmlal_u32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmls_s8() {
let a: i8x8 = i8x8::new(6, 7, 8, 9, 10, 11, 12, 13);
@ -6245,6 +6537,66 @@ mod test {
assert_eq!(r, e);
}
// Tests for the non-high widening multiply-subtract intrinsics:
// every lane computes a - 2*3.
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_s8() {
let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3);
let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let r: i16x8 = transmute(vmlsl_s8(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_s16() {
let a: i32x4 = i32x4::new(6, 7, 8, 9);
let b: i16x4 = i16x4::new(2, 2, 2, 2);
let c: i16x4 = i16x4::new(3, 3, 3, 3);
let e: i32x4 = i32x4::new(0, 1, 2, 3);
let r: i32x4 = transmute(vmlsl_s16(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_s32() {
let a: i64x2 = i64x2::new(6, 7);
let b: i32x2 = i32x2::new(2, 2);
let c: i32x2 = i32x2::new(3, 3);
let e: i64x2 = i64x2::new(0, 1);
let r: i64x2 = transmute(vmlsl_s32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_u8() {
let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3);
let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let r: u16x8 = transmute(vmlsl_u8(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_u16() {
let a: u32x4 = u32x4::new(6, 7, 8, 9);
let b: u16x4 = u16x4::new(2, 2, 2, 2);
let c: u16x4 = u16x4::new(3, 3, 3, 3);
let e: u32x4 = u32x4::new(0, 1, 2, 3);
let r: u32x4 = transmute(vmlsl_u16(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmlsl_u32() {
let a: u64x2 = u64x2::new(6, 7);
let b: u32x2 = u32x2::new(2, 2);
let c: u32x2 = u32x2::new(3, 3);
let e: u64x2 = u64x2::new(0, 1);
let r: u64x2 = transmute(vmlsl_u32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vqsub_u8() {
let a: u8x8 = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42);
@ -6875,6 +7227,69 @@ mod test {
assert_eq!(r, e);
}
// Tests for the non-high widening multiply intrinsics: the expected vector
// is the lanewise product of `a` and `b` at double width.
#[simd_test(enable = "neon")]
unsafe fn test_vmull_s8() {
let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let b: i8x8 = i8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
let e: i16x8 = i16x8::new(1, 4, 3, 8, 5, 12, 7, 16);
let r: i16x8 = transmute(vmull_s8(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_s16() {
let a: i16x4 = i16x4::new(1, 2, 3, 4);
let b: i16x4 = i16x4::new(1, 2, 1, 2);
let e: i32x4 = i32x4::new(1, 4, 3, 8);
let r: i32x4 = transmute(vmull_s16(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_s32() {
let a: i32x2 = i32x2::new(1, 2);
let b: i32x2 = i32x2::new(1, 2);
let e: i64x2 = i64x2::new(1, 4);
let r: i64x2 = transmute(vmull_s32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_u8() {
let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let b: u8x8 = u8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
let e: u16x8 = u16x8::new(1, 4, 3, 8, 5, 12, 7, 16);
let r: u16x8 = transmute(vmull_u8(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_u16() {
let a: u16x4 = u16x4::new(1, 2, 3, 4);
let b: u16x4 = u16x4::new(1, 2, 1, 2);
let e: u32x4 = u32x4::new(1, 4, 3, 8);
let r: u32x4 = transmute(vmull_u16(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_u32() {
let a: u32x2 = u32x2::new(1, 2);
let b: u32x2 = u32x2::new(1, 2);
let e: u64x2 = u64x2::new(1, 4);
let r: u64x2 = transmute(vmull_u32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_p8() {
// Polynomial multiply: expected values are carry-less (XOR-accumulated)
// products, e.g. 2 ⊗ 3 = 6 but 6 ⊗ 3 = 10, not 18.
let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let b: i8x8 = i8x8::new(1, 3, 1, 3, 1, 3, 1, 3);
let e: i16x8 = i16x8::new(1, 6, 3, 12, 5, 10, 7, 24);
let r: i16x8 = transmute(vmull_p8(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsub_s8() {
let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);

View file

@ -784,6 +784,60 @@ generate float64x*_t
arm = vmla.
generate float*_t
/// Signed multiply-add long
name = vmlal
multi_fn = simd_add, a, {vmull-self-noext, b, c}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
arm = vmlal.s
aarch64 = smlal
generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
/// Unsigned multiply-add long
name = vmlal
multi_fn = simd_add, a, {vmull-self-noext, b, c}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
arm = vmlal.s
aarch64 = umlal
generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t
/// Signed multiply-add long
name = vmlal_high
no-q
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
multi_fn = vmlal-noqself-noext, a, b, c
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate 8, 9, 10, 11, 12, 13, 14, 15
aarch64 = smlal2
generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
/// Unsigned multiply-add long
name = vmlal_high
no-q
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
multi_fn = vmlal-noqself-noext, a, b, c
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate 8, 9, 10, 11, 12, 13, 14, 15
aarch64 = umlal2
generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
/// Multiply-subtract from accumulator
name = vmls
multi_fn = simd_sub, a, {simd_mul, b, c}
@ -810,6 +864,60 @@ generate float64x*_t
arm = vmls.
generate float*_t
/// Signed multiply-subtract long
name = vmlsl
multi_fn = simd_sub, a, {vmull-self-noext, b, c}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
arm = vmlsl.s
aarch64 = smlsl
generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
/// Unsigned multiply-subtract long
name = vmlsl
multi_fn = simd_sub, a, {vmull-self-noext, b, c}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
arm = vmlsl.s
aarch64 = umlsl
generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t
/// Signed multiply-subtract long
name = vmlsl_high
no-q
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
multi_fn = vmlsl-noqself-noext, a, b, c
a = 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate 14, 13, 12, 11, 10, 9, 8, 7
aarch64 = smlsl2
generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
/// Unsigned multiply-subtract long
name = vmlsl_high
no-q
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
multi_fn = vmlsl-noqself-noext, a, b, c
a = 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate 14, 13, 12, 11, 10, 9, 8, 7
aarch64 = umlsl2
generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
/// Saturating subtract
name = vqsub
a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
@ -907,6 +1015,84 @@ generate float64x*_t
arm = vmul.
generate float*_t
/// Signed multiply long
name = vmull
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32
arm = vmull.s
aarch64 = smull
link-arm = vmulls._EXT_
link-aarch64 = smull._EXT_
generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t
/// Signed multiply long
name = vmull_high
no-q
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = vmull-noqself-noext, a, b
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate 9, 20, 11, 24, 13, 28, 15, 32
aarch64 = smull2
generate int8x16_t:int8x16_t:int16x8_t, int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t
/// Unsigned multiply long
name = vmull
a = 1, 2, 3, 4, 5, 6, 7, 8
b = 1, 2, 1, 2, 1, 2, 1, 2
validate 1, 4, 3, 8, 5, 12, 7, 16
arm = vmull.s
aarch64 = umull
link-arm = vmullu._EXT_
link-aarch64 = umull._EXT_
generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t
/// Unsigned multiply long
name = vmull_high
no-q
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = vmull-noqself-noext, a, b
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate 9, 20, 11, 24, 13, 28, 15, 32
aarch64 = umull2
generate uint8x16_t:uint8x16_t:uint16x8_t, uint16x8_t:uint16x8_t:uint32x4_t, uint32x4_t:uint32x4_t:uint64x2_t
/// Polynomial multiply long
name = vmull
a = 1, 2, 3, 4, 5, 6, 7, 8
b = 1, 3, 1, 3, 1, 3, 1, 3
validate 1, 6, 3, 12, 5, 10, 7, 24
arm = vmull.s
aarch64 = pmull
link-arm = vmullp._EXT_
link-aarch64 = pmull._EXT_
generate poly8x8_t:poly8x8_t:poly16x8_t
/// Polynomial multiply long
name = vmull_high
no-q
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = vmull-noqself-noext, a, b
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate 9, 30, 11, 20, 13, 18, 15, 48
aarch64 = pmull
generate poly8x16_t:poly8x16_t:poly16x8_t
/// Divide
name = vdiv
fn = simd_div

View file

@ -75,6 +75,8 @@ fn type_len(t: &str) -> usize {
"float64x2_t" => 2,
"poly8x8_t" => 8,
"poly8x16_t" => 16,
"poly16x4_t" => 4,
"poly16x8_t" => 8,
"poly64x1_t" => 1,
"poly64x2_t" => 2,
_ => panic!("unknown type: {}", t),
@ -231,6 +233,8 @@ fn type_to_global_type(t: &str) -> &str {
"float64x2_t" => "f64x2",
"poly8x8_t" => "i8x8",
"poly8x16_t" => "i8x16",
"poly16x4_t" => "i16x4",
"poly16x8_t" => "i16x8",
"poly64x1_t" => "i64x1",
"poly64x2_t" => "i64x2",
_ => panic!("unknown type: {}", t),
@ -291,6 +295,10 @@ fn type_to_ext(t: &str) -> &str {
"float32x4_t" => "v4f32",
"float64x1_t" => "v1f64",
"float64x2_t" => "v2f64",
"poly8x8_t" => "v8i8",
"poly8x16_t" => "v16i8",
"poly16x4_t" => "v4i16",
"poly16x8_t" => "v8i16",
/*
"poly64x1_t" => "i64x1",
"poly64x2_t" => "i64x2",
@ -299,6 +307,24 @@ fn type_to_ext(t: &str) -> &str {
}
}
/// Maps a 128-bit NEON vector type name to the 64-bit vector type that
/// holds half its lanes (used by the `_high` intrinsic generators, which
/// shuffle out the upper half of a quad register before widening).
fn type_to_half(t: &str) -> &str {
    match t {
        // 8-bit element types: 16 lanes -> 8 lanes
        "int8x16_t" => "int8x8_t",
        "uint8x16_t" => "uint8x8_t",
        "poly8x16_t" => "poly8x8_t",
        // 16-bit element types: 8 lanes -> 4 lanes
        "int16x8_t" => "int16x4_t",
        "uint16x8_t" => "uint16x4_t",
        "poly16x8_t" => "poly16x4_t",
        // 32-bit element types: 4 lanes -> 2 lanes
        "int32x4_t" => "int32x2_t",
        "uint32x4_t" => "uint32x2_t",
        "float32x4_t" => "float32x2_t",
        // 64-bit element types: 2 lanes -> 1 lane
        "int64x2_t" => "int64x1_t",
        "uint64x2_t" => "uint64x1_t",
        "float64x2_t" => "float64x1_t",
        other => panic!("unknown half type for {}", other),
    }
}
fn values(t: &str, vs: &[String]) -> String {
if vs.len() == 1 && !t.contains('x') {
format!(": {} = {}", t, vs[0])
@ -588,7 +614,7 @@ fn gen_aarch64(
in_t,
&out_t,
current_tests,
[type_len(in_t[0]), type_len(in_t[0]), type_len(in_t[0])],
[type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])],
type_len(out_t),
para_num,
);
@ -843,8 +869,8 @@ fn gen_arm(
{}
"#,
current_comment,
expand_intrinsic(&current_arm, in_t[0]),
expand_intrinsic(&current_aarch64, in_t[0]),
expand_intrinsic(&current_arm, in_t[1]),
expand_intrinsic(&current_aarch64, in_t[1]),
call,
);
let test = gen_test(
@ -887,6 +913,8 @@ fn expand_intrinsic(intr: &str, t: &str) -> String {
"float64x2_t" => "f64",
"poly8x8_t" => "i8",
"poly8x16_t" => "i8",
"poly16x4_t" => "i16",
"poly16x8_t" => "i16",
/*
"poly64x1_t" => "i64x1",
"poly64x2_t" => "i64x2",
@ -912,6 +940,10 @@ fn expand_intrinsic(intr: &str, t: &str) -> String {
"uint32x4_t" => "u32",
"uint64x1_t" => "u64",
"uint64x2_t" => "u64",
"poly8x8_t" => "p8",
"poly8x16_t" => "p8",
"poly16x4_t" => "p16",
"poly16x8_t" => "p16",
"float16x4_t" => "f16",
"float16x8_t" => "f16",
"float32x2_t" => "f32",
@ -978,11 +1010,13 @@ fn get_call(
} else if s.contains(':') {
let re_params: Vec<_> = s.split(':').map(|v| v.to_string()).collect();
if re_params[1] == "" {
re = Some((re_params[0].clone(), in_t[0].to_string()));
re = Some((re_params[0].clone(), in_t[1].to_string()));
} else if re_params[1] == "in_t" {
re = Some((re_params[0].clone(), in_t[0].to_string()));
re = Some((re_params[0].clone(), in_t[1].to_string()));
} else if re_params[1] == "out_t" {
re = Some((re_params[0].clone(), out_t.to_string()));
} else if re_params[1] == "half" {
re = Some((re_params[0].clone(), type_to_half(in_t[1]).to_string()));
} else {
re = Some((re_params[0].clone(), re_params[1].clone()));
}
@ -996,9 +1030,20 @@ fn get_call(
}
if fn_name == "fixed" {
let (re_name, re_type) = re.unwrap();
let fixed: Vec<String> = fixed.iter().take(type_len(in_t[0])).cloned().collect();
let fixed: Vec<String> = fixed.iter().take(type_len(in_t[1])).cloned().collect();
return format!(r#"let {}{};"#, re_name, values(&re_type, &fixed));
}
if fn_name == "fixed-half-right" {
let fixed: Vec<String> = fixed.iter().take(type_len(in_t[1])).cloned().collect();
let half = fixed[type_len(in_t[1]) / 2..]
.iter()
.fold(String::new(), |mut s, fix| {
s.push_str(fix);
s.push_str(", ");
s
});
return format!(r#"[{}]"#, &half[..half.len() - 2]);
}
if fn_name.contains('-') {
let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect();
assert_eq!(fn_format.len(), 3);
@ -1018,6 +1063,10 @@ fn get_call(
} else if fn_format[1] == "noqself" {
fn_name.push_str(type_to_noq_suffix(in_t[1]));
} else if fn_format[1] == "nosuffix" {
} else if fn_format[1] == "in_len" {
fn_name.push_str(&type_len(in_t[1]).to_string());
} else if fn_format[1] == "out_len" {
fn_name.push_str(&type_len(out_t).to_string());
} else {
fn_name.push_str(&fn_format[1]);
};