diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs index 6ede0275b4e8..e87ce56f7c26 100644 --- a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs +++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs @@ -1617,6 +1617,66 @@ pub unsafe fn vmlaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float simd_add(a, simd_mul(b, c)) } +/// Signed multiply-add long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlal2))] +pub unsafe fn vmlal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t { + let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let c: int8x8_t = simd_shuffle8(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); + vmlal_s8(a, b, c) +} + +/// Signed multiply-add long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlal2))] +pub unsafe fn vmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { + let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]); + let c: int16x4_t = simd_shuffle4(c, c, [4, 5, 6, 7]); + vmlal_s16(a, b, c) +} + +/// Signed multiply-add long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlal2))] +pub unsafe fn vmlal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { + let b: int32x2_t = simd_shuffle2(b, b, [2, 3]); + let c: int32x2_t = simd_shuffle2(c, c, [2, 3]); + vmlal_s32(a, b, c) +} + +/// Unsigned multiply-add long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlal2))] +pub unsafe fn vmlal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t { + let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let c: uint8x8_t = simd_shuffle8(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); + vmlal_u8(a, b, c) +} + +/// Unsigned multiply-add long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, 
assert_instr(umlal2))] +pub unsafe fn vmlal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t { + let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]); + let c: uint16x4_t = simd_shuffle4(c, c, [4, 5, 6, 7]); + vmlal_u16(a, b, c) +} + +/// Unsigned multiply-add long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlal2))] +pub unsafe fn vmlal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t { + let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]); + let c: uint32x2_t = simd_shuffle2(c, c, [2, 3]); + vmlal_u32(a, b, c) +} + /// Floating-point multiply-subtract from accumulator #[inline] #[target_feature(enable = "neon")] @@ -1633,6 +1693,66 @@ pub unsafe fn vmlsq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float simd_sub(a, simd_mul(b, c)) } +/// Signed multiply-subtract long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlsl2))] +pub unsafe fn vmlsl_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t { + let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let c: int8x8_t = simd_shuffle8(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); + vmlsl_s8(a, b, c) +} + +/// Signed multiply-subtract long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlsl2))] +pub unsafe fn vmlsl_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t { + let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]); + let c: int16x4_t = simd_shuffle4(c, c, [4, 5, 6, 7]); + vmlsl_s16(a, b, c) +} + +/// Signed multiply-subtract long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smlsl2))] +pub unsafe fn vmlsl_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t { + let b: int32x2_t = simd_shuffle2(b, b, [2, 3]); + let c: int32x2_t = simd_shuffle2(c, c, [2, 3]); + vmlsl_s32(a, b, c) +} + +/// Unsigned multiply-subtract long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, 
assert_instr(umlsl2))] +pub unsafe fn vmlsl_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t { + let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let c: uint8x8_t = simd_shuffle8(c, c, [8, 9, 10, 11, 12, 13, 14, 15]); + vmlsl_u8(a, b, c) +} + +/// Unsigned multiply-subtract long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlsl2))] +pub unsafe fn vmlsl_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t { + let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]); + let c: uint16x4_t = simd_shuffle4(c, c, [4, 5, 6, 7]); + vmlsl_u16(a, b, c) +} + +/// Unsigned multiply-subtract long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umlsl2))] +pub unsafe fn vmlsl_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t { + let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]); + let c: uint32x2_t = simd_shuffle2(c, c, [2, 3]); + vmlsl_u32(a, b, c) +} + /// Multiply #[inline] #[target_feature(enable = "neon")] @@ -1649,6 +1769,76 @@ pub unsafe fn vmulq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { simd_mul(a, b) } +/// Signed multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smull2))] +pub unsafe fn vmull_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { + let a: int8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + vmull_s8(a, b) +} + +/// Signed multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smull2))] +pub unsafe fn vmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]); + let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]); + vmull_s16(a, b) +} + +/// Signed multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(smull2))] +pub unsafe fn vmull_high_s32(a: int32x4_t, b: 
int32x4_t) -> int64x2_t { + let a: int32x2_t = simd_shuffle2(a, a, [2, 3]); + let b: int32x2_t = simd_shuffle2(b, b, [2, 3]); + vmull_s32(a, b) +} + +/// Unsigned multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umull2))] +pub unsafe fn vmull_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { + let a: uint8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + vmull_u8(a, b) +} + +/// Unsigned multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umull2))] +pub unsafe fn vmull_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { + let a: uint16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]); + let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]); + vmull_u16(a, b) +} + +/// Unsigned multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(umull2))] +pub unsafe fn vmull_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { + let a: uint32x2_t = simd_shuffle2(a, a, [2, 3]); + let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]); + vmull_u32(a, b) +} + +/// Polynomial multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(pmull))] +pub unsafe fn vmull_high_p8(a: poly8x16_t, b: poly8x16_t) -> poly16x8_t { + let a: poly8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: poly8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + vmull_p8(a, b) +} + /// Divide #[inline] #[target_feature(enable = "neon")] @@ -3258,6 +3448,66 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vmlal_high_s8() { + let a: i16x8 = i16x8::new(8, 7, 6, 5, 4, 3, 2, 1); + let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let c: i8x16 = i8x16::new(3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7); + let e: i16x8 = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15); + let r: i16x8 = 
transmute(vmlal_high_s8(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlal_high_s16() { + let a: i32x4 = i32x4::new(8, 7, 6, 5); + let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3); + let e: i32x4 = i32x4::new(8, 9, 10, 11); + let r: i32x4 = transmute(vmlal_high_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlal_high_s32() { + let a: i64x2 = i64x2::new(8, 7); + let b: i32x4 = i32x4::new(2, 2, 2, 2); + let c: i32x4 = i32x4::new(3, 3, 0, 1); + let e: i64x2 = i64x2::new(8, 9); + let r: i64x2 = transmute(vmlal_high_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlal_high_u8() { + let a: u16x8 = u16x8::new(8, 7, 6, 5, 4, 3, 2, 1); + let b: u8x16 = u8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let c: u8x16 = u8x16::new(3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7); + let e: u16x8 = u16x8::new(8, 9, 10, 11, 12, 13, 14, 15); + let r: u16x8 = transmute(vmlal_high_u8(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlal_high_u16() { + let a: u32x4 = u32x4::new(8, 7, 6, 5); + let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3); + let e: u32x4 = u32x4::new(8, 9, 10, 11); + let r: u32x4 = transmute(vmlal_high_u16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlal_high_u32() { + let a: u64x2 = u64x2::new(8, 7); + let b: u32x4 = u32x4::new(2, 2, 2, 2); + let c: u32x4 = u32x4::new(3, 3, 0, 1); + let e: u64x2 = u64x2::new(8, 9); + let r: u64x2 = transmute(vmlal_high_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmls_f64() { 
let a: f64 = 6.; @@ -3278,6 +3528,66 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_high_s8() { + let a: i16x8 = i16x8::new(14, 15, 16, 17, 18, 19, 20, 21); + let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let c: i8x16 = i8x16::new(3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7); + let e: i16x8 = i16x8::new(14, 13, 12, 11, 10, 9, 8, 7); + let r: i16x8 = transmute(vmlsl_high_s8(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_high_s16() { + let a: i32x4 = i32x4::new(14, 15, 16, 17); + let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3); + let e: i32x4 = i32x4::new(14, 13, 12, 11); + let r: i32x4 = transmute(vmlsl_high_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_high_s32() { + let a: i64x2 = i64x2::new(14, 15); + let b: i32x4 = i32x4::new(2, 2, 2, 2); + let c: i32x4 = i32x4::new(3, 3, 0, 1); + let e: i64x2 = i64x2::new(14, 13); + let r: i64x2 = transmute(vmlsl_high_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_high_u8() { + let a: u16x8 = u16x8::new(14, 15, 16, 17, 18, 19, 20, 21); + let b: u8x16 = u8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + let c: u8x16 = u8x16::new(3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7); + let e: u16x8 = u16x8::new(14, 13, 12, 11, 10, 9, 8, 7); + let r: u16x8 = transmute(vmlsl_high_u8(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_high_u16() { + let a: u32x4 = u32x4::new(14, 15, 16, 17); + let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3); + let e: u32x4 = u32x4::new(14, 13, 12, 11); + let r: u32x4 = 
transmute(vmlsl_high_u16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_high_u32() { + let a: u64x2 = u64x2::new(14, 15); + let b: u32x4 = u32x4::new(2, 2, 2, 2); + let c: u32x4 = u32x4::new(3, 3, 0, 1); + let e: u64x2 = u64x2::new(14, 13); + let r: u64x2 = transmute(vmlsl_high_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmul_f64() { let a: f64 = 1.0; @@ -3296,6 +3606,69 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_s8() { + let a: i8x16 = i8x16::new(1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16); + let b: i8x16 = i8x16::new(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2); + let e: i16x8 = i16x8::new(9, 20, 11, 24, 13, 28, 15, 32); + let r: i16x8 = transmute(vmull_high_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_s16() { + let a: i16x8 = i16x8::new(1, 2, 9, 10, 9, 10, 11, 12); + let b: i16x8 = i16x8::new(1, 2, 1, 2, 1, 2, 1, 2); + let e: i32x4 = i32x4::new(9, 20, 11, 24); + let r: i32x4 = transmute(vmull_high_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_s32() { + let a: i32x4 = i32x4::new(1, 2, 9, 10); + let b: i32x4 = i32x4::new(1, 2, 1, 2); + let e: i64x2 = i64x2::new(9, 20); + let r: i64x2 = transmute(vmull_high_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_u8() { + let a: u8x16 = u8x16::new(1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16); + let b: u8x16 = u8x16::new(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2); + let e: u16x8 = u16x8::new(9, 20, 11, 24, 13, 28, 15, 32); + let r: u16x8 = transmute(vmull_high_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn 
test_vmull_high_u16() { + let a: u16x8 = u16x8::new(1, 2, 9, 10, 9, 10, 11, 12); + let b: u16x8 = u16x8::new(1, 2, 1, 2, 1, 2, 1, 2); + let e: u32x4 = u32x4::new(9, 20, 11, 24); + let r: u32x4 = transmute(vmull_high_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_u32() { + let a: u32x4 = u32x4::new(1, 2, 9, 10); + let b: u32x4 = u32x4::new(1, 2, 1, 2); + let e: u64x2 = u64x2::new(9, 20); + let r: u64x2 = transmute(vmull_high_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_high_p8() { + let a: i8x16 = i8x16::new(1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16); + let b: i8x16 = i8x16::new(1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3); + let e: i16x8 = i16x8::new(9, 30, 11, 20, 13, 18, 15, 48); + let r: i16x8 = transmute(vmull_high_p8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vdiv_f32() { let a: f32x2 = f32x2::new(2.0, 6.0); diff --git a/library/stdarch/crates/core_arch/src/arm/neon/generated.rs b/library/stdarch/crates/core_arch/src/arm/neon/generated.rs index 8031d4ad066c..7a1e983e68ca 100644 --- a/library/stdarch/crates/core_arch/src/arm/neon/generated.rs +++ b/library/stdarch/crates/core_arch/src/arm/neon/generated.rs @@ -2121,6 +2121,66 @@ pub unsafe fn vmlaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float simd_add(a, simd_mul(b, c)) } +/// Signed multiply-add long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))] +pub unsafe fn vmlal_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t { + simd_add(a, vmull_s8(b, c)) +} + +/// Signed multiply-add long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))] +pub unsafe fn vmlal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + simd_add(a, vmull_s16(b, c)) +} + +/// Signed multiply-add long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))] +pub unsafe fn vmlal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + simd_add(a, vmull_s32(b, c)) +} + +/// Unsigned multiply-add long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))] +pub unsafe fn vmlal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t { + simd_add(a, vmull_u8(b, c)) +} + +/// Unsigned multiply-add long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))] +pub unsafe fn vmlal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { + simd_add(a, vmull_u16(b, c)) +} + +/// Unsigned multiply-add long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))] +pub unsafe fn vmlal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { + simd_add(a, vmull_u32(b, c)) +} + /// Multiply-subtract from accumulator #[inline] #[target_feature(enable = "neon")] @@ 
-2261,6 +2321,66 @@ pub unsafe fn vmlsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float simd_sub(a, simd_mul(b, c)) } +/// Signed multiply-subtract long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))] +pub unsafe fn vmlsl_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t { + simd_sub(a, vmull_s8(b, c)) +} + +/// Signed multiply-subtract long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))] +pub unsafe fn vmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + simd_sub(a, vmull_s16(b, c)) +} + +/// Signed multiply-subtract long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))] +pub unsafe fn vmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + simd_sub(a, vmull_s32(b, c)) +} + +/// Unsigned multiply-subtract long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))] +pub unsafe fn vmlsl_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t { + simd_sub(a, vmull_u8(b, c)) +} + +/// Unsigned multiply-subtract long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16"))] +#[cfg_attr(all(test, 
target_arch = "aarch64"), assert_instr(umlsl))] +pub unsafe fn vmlsl_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { + simd_sub(a, vmull_u16(b, c)) +} + +/// Unsigned multiply-subtract long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))] +pub unsafe fn vmlsl_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { + simd_sub(a, vmull_u32(b, c)) +} + /// Saturating subtract #[inline] #[target_feature(enable = "neon")] @@ -3297,6 +3417,118 @@ pub unsafe fn vmulq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { simd_mul(a, b) } +/// Signed multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))] +pub unsafe fn vmull_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v8i8")] + fn vmull_s8_(a: int8x8_t, b: int8x8_t) -> int16x8_t; + } +vmull_s8_(a, b) +} + +/// Signed multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))] +pub unsafe fn vmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v4i16")] + fn vmull_s16_(a: int16x4_t, b: int16x4_t) -> 
int32x4_t; + } +vmull_s16_(a, b) +} + +/// Signed multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))] +pub unsafe fn vmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v2i32")] + fn vmull_s32_(a: int32x2_t, b: int32x2_t) -> int64x2_t; + } +vmull_s32_(a, b) +} + +/// Unsigned multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] +pub unsafe fn vmull_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v8i8")] + fn vmull_u8_(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t; + } +vmull_u8_(a, b) +} + +/// Unsigned multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] +pub unsafe fn vmull_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v4i16")] + fn vmull_u16_(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t; + } +vmull_u16_(a, b) +} + +/// Unsigned multiply long +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] +pub unsafe fn vmull_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v2i32")] + fn vmull_u32_(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t; + } +vmull_u32_(a, b) +} + +/// Polynomial multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.p8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmull))] +pub unsafe fn vmull_p8(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullp.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmull.v8i8")] + fn vmull_p8_(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t; + } +vmull_p8_(a, b) +} + /// Subtract #[inline] #[target_feature(enable = "neon")] @@ -6105,6 +6337,66 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vmlal_s8() { + let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: i16x8 = transmute(vmlal_s8(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlal_s16() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16x4 = i16x4::new(3, 3, 3, 3); + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = 
transmute(vmlal_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlal_s32() { + let a: i64x2 = i64x2::new(0, 1); + let b: i32x2 = i32x2::new(2, 2); + let c: i32x2 = i32x2::new(3, 3); + let e: i64x2 = i64x2::new(6, 7); + let r: i64x2 = transmute(vmlal_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlal_u8() { + let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: u16x8 = transmute(vmlal_u8(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlal_u16() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x4 = u16x4::new(3, 3, 3, 3); + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlal_u16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlal_u32() { + let a: u64x2 = u64x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x2 = u32x2::new(3, 3); + let e: u64x2 = u64x2::new(6, 7); + let r: u64x2 = transmute(vmlal_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmls_s8() { let a: i8x8 = i8x8::new(6, 7, 8, 9, 10, 11, 12, 13); @@ -6245,6 +6537,66 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_s8() { + let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: i16x8 = transmute(vmlsl_s8(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] 
+ unsafe fn test_vmlsl_s16() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16x4 = i16x4::new(3, 3, 3, 3); + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = transmute(vmlsl_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_s32() { + let a: i64x2 = i64x2::new(6, 7); + let b: i32x2 = i32x2::new(2, 2); + let c: i32x2 = i32x2::new(3, 3); + let e: i64x2 = i64x2::new(0, 1); + let r: i64x2 = transmute(vmlsl_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_u8() { + let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: u16x8 = transmute(vmlsl_u8(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_u16() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x4 = u16x4::new(3, 3, 3, 3); + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsl_u16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_u32() { + let a: u64x2 = u64x2::new(6, 7); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x2 = u32x2::new(3, 3); + let e: u64x2 = u64x2::new(0, 1); + let r: u64x2 = transmute(vmlsl_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vqsub_u8() { let a: u8x8 = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42); @@ -6875,6 +7227,69 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vmull_s8() { + let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: i8x8 = i8x8::new(1, 2, 1, 2, 1, 2, 1, 2); + let e: i16x8 = i16x8::new(1, 
4, 3, 8, 5, 12, 7, 16); + let r: i16x8 = transmute(vmull_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_s16() { + let a: i16x4 = i16x4::new(1, 2, 3, 4); + let b: i16x4 = i16x4::new(1, 2, 1, 2); + let e: i32x4 = i32x4::new(1, 4, 3, 8); + let r: i32x4 = transmute(vmull_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_s32() { + let a: i32x2 = i32x2::new(1, 2); + let b: i32x2 = i32x2::new(1, 2); + let e: i64x2 = i64x2::new(1, 4); + let r: i64x2 = transmute(vmull_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_u8() { + let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: u8x8 = u8x8::new(1, 2, 1, 2, 1, 2, 1, 2); + let e: u16x8 = u16x8::new(1, 4, 3, 8, 5, 12, 7, 16); + let r: u16x8 = transmute(vmull_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_u16() { + let a: u16x4 = u16x4::new(1, 2, 3, 4); + let b: u16x4 = u16x4::new(1, 2, 1, 2); + let e: u32x4 = u32x4::new(1, 4, 3, 8); + let r: u32x4 = transmute(vmull_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_u32() { + let a: u32x2 = u32x2::new(1, 2); + let b: u32x2 = u32x2::new(1, 2); + let e: u64x2 = u64x2::new(1, 4); + let r: u64x2 = transmute(vmull_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmull_p8() { + let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: i8x8 = i8x8::new(1, 3, 1, 3, 1, 3, 1, 3); + let e: i16x8 = i16x8::new(1, 6, 3, 12, 5, 10, 7, 24); + let r: i16x8 = transmute(vmull_p8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vsub_s8() { let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); diff --git a/library/stdarch/crates/stdarch-gen/neon.spec 
b/library/stdarch/crates/stdarch-gen/neon.spec index a2a3d0382512..fe32cc8ed1ab 100644 --- a/library/stdarch/crates/stdarch-gen/neon.spec +++ b/library/stdarch/crates/stdarch-gen/neon.spec @@ -784,6 +784,60 @@ generate float64x*_t arm = vmla. generate float*_t +/// Signed multiply-add long +name = vmlal +multi_fn = simd_add, a, {vmull-self-noext, b, c} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 + +arm = vmlal.s +aarch64 = smlal +generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t + +/// Unsigned multiply-add long +name = vmlal +multi_fn = simd_add, a, {vmull-self-noext, b, c} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 + +arm = vmlal.s +aarch64 = umlal +generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t + +/// Signed multiply-add long +name = vmlal_high +no-q +multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right} +multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right} +multi_fn = vmlal-noqself-noext, a, b, c +a = 8, 7, 6, 5, 4, 3, 2, 1 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 +fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 8, 9, 10, 11, 12, 13, 14, 15 + +aarch64 = smlal2 +generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t + +/// Unsigned multiply-add long +name = vmlal_high +no-q +multi_fn = simd_shuffle-out_len-noext, b:half, b, b, 
{fixed-half-right} +multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right} +multi_fn = vmlal-noqself-noext, a, b, c +a = 8, 7, 6, 5, 4, 3, 2, 1 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 +fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 8, 9, 10, 11, 12, 13, 14, 15 + +aarch64 = umlal2 +generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t + /// Multiply-subtract from accumulator name = vmls multi_fn = simd_sub, a, {simd_mul, b, c} @@ -810,6 +864,60 @@ generate float64x*_t arm = vmls. generate float*_t +/// Signed multiply-subtract long +name = vmlsl +multi_fn = simd_sub, a, {vmull-self-noext, b, c} +a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +arm = vmlsl.s +aarch64 = smlsl +generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t + +/// Unsigned multiply-subtract long +name = vmlsl +multi_fn = simd_sub, a, {vmull-self-noext, b, c} +a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +arm = vmlsl.s +aarch64 = umlsl +generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t + +/// Signed multiply-subtract long +name = vmlsl_high +no-q +multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right} +multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right} +multi_fn = vmlsl-noqself-noext, a, b, c +a = 14, 15, 16, 17, 18, 19, 20, 21 +b = 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 +fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 14, 13, 12, 11, 10, 9, 8, 7 + +aarch64 = smlsl2 +generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t + +/// Unsigned multiply-subtract long +name = vmlsl_high +no-q +multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right} +multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right} +multi_fn = vmlsl-noqself-noext, a, b, c +a = 14, 15, 16, 17, 18, 19, 20, 21 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 +fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 14, 13, 12, 11, 10, 9, 8, 7 + +aarch64 = umlsl2 +generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t + /// Saturating subtract name = vqsub a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 @@ -907,6 +1015,84 @@ generate float64x*_t arm = vmul. 
generate float*_t +/// Signed multiply long +name = vmull +a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 +validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32 + +arm = vmull.s +aarch64 = smull +link-arm = vmulls._EXT_ +link-aarch64 = smull._EXT_ +generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t + +/// Signed multiply long +name = vmull_high +no-q +multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right} +multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right} +multi_fn = vmull-noqself-noext, a, b +a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 +fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 9, 20, 11, 24, 13, 28, 15, 32 + +aarch64 = smull2 +generate int8x16_t:int8x16_t:int16x8_t, int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t + +/// Unsigned multiply long +name = vmull +a = 1, 2, 3, 4, 5, 6, 7, 8 +b = 1, 2, 1, 2, 1, 2, 1, 2 +validate 1, 4, 3, 8, 5, 12, 7, 16 + +arm = vmull.s +aarch64 = umull +link-arm = vmullu._EXT_ +link-aarch64 = umull._EXT_ +generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t + +/// Unsigned multiply long +name = vmull_high +no-q +multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right} +multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right} +multi_fn = vmull-noqself-noext, a, b +a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 +fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 9, 20, 11, 24, 13, 28, 15, 32 + +aarch64 = umull2 +generate uint8x16_t:uint8x16_t:uint16x8_t, uint16x8_t:uint16x8_t:uint32x4_t, uint32x4_t:uint32x4_t:uint64x2_t + +/// Polynomial multiply long +name = vmull +a = 1, 2, 3, 4, 5, 6, 7, 8 +b = 1, 
3, 1, 3, 1, 3, 1, 3 +validate 1, 6, 3, 12, 5, 10, 7, 24 + +arm = vmull.s +aarch64 = pmull +link-arm = vmullp._EXT_ +link-aarch64 = pmull._EXT_ +generate poly8x8_t:poly8x8_t:poly16x8_t + +/// Polynomial multiply long +name = vmull_high +no-q +multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right} +multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right} +multi_fn = vmull-noqself-noext, a, b +a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 +b = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3 +fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +validate 9, 30, 11, 20, 13, 18, 15, 48 + +aarch64 = pmull +generate poly8x16_t:poly8x16_t:poly16x8_t + /// Divide name = vdiv fn = simd_div diff --git a/library/stdarch/crates/stdarch-gen/src/main.rs b/library/stdarch/crates/stdarch-gen/src/main.rs index 59073d1f4714..2d323be79cde 100644 --- a/library/stdarch/crates/stdarch-gen/src/main.rs +++ b/library/stdarch/crates/stdarch-gen/src/main.rs @@ -75,6 +75,8 @@ fn type_len(t: &str) -> usize { "float64x2_t" => 2, "poly8x8_t" => 8, "poly8x16_t" => 16, + "poly16x4_t" => 4, + "poly16x8_t" => 8, "poly64x1_t" => 1, "poly64x2_t" => 2, _ => panic!("unknown type: {}", t), @@ -231,6 +233,8 @@ fn type_to_global_type(t: &str) -> &str { "float64x2_t" => "f64x2", "poly8x8_t" => "i8x8", "poly8x16_t" => "i8x16", + "poly16x4_t" => "i16x4", + "poly16x8_t" => "i16x8", "poly64x1_t" => "i64x1", "poly64x2_t" => "i64x2", _ => panic!("unknown type: {}", t), @@ -291,6 +295,10 @@ fn type_to_ext(t: &str) -> &str { "float32x4_t" => "v4f32", "float64x1_t" => "v1f64", "float64x2_t" => "v2f64", + "poly8x8_t" => "v8i8", + "poly8x16_t" => "v16i8", + "poly16x4_t" => "v4i16", + "poly16x8_t" => "v8i16", /* "poly64x1_t" => "i64x1", "poly64x2_t" => "i64x2", @@ -299,6 +307,24 @@ fn type_to_ext(t: &str) -> &str { } } +fn type_to_half(t: &str) -> &str { + match t { + "int8x16_t" => "int8x8_t", + "int16x8_t" => "int16x4_t", + "int32x4_t" => "int32x2_t", + "int64x2_t" 
=> "int64x1_t", + "uint8x16_t" => "uint8x8_t", + "uint16x8_t" => "uint16x4_t", + "uint32x4_t" => "uint32x2_t", + "uint64x2_t" => "uint64x1_t", + "poly8x16_t" => "poly8x8_t", + "poly16x8_t" => "poly16x4_t", + "float32x4_t" => "float32x2_t", + "float64x2_t" => "float64x1_t", + _ => panic!("unknown half type for {}", t), + } +} + fn values(t: &str, vs: &[String]) -> String { if vs.len() == 1 && !t.contains('x') { format!(": {} = {}", t, vs[0]) @@ -588,7 +614,7 @@ fn gen_aarch64( in_t, &out_t, current_tests, - [type_len(in_t[0]), type_len(in_t[0]), type_len(in_t[0])], + [type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])], type_len(out_t), para_num, ); @@ -843,8 +869,8 @@ fn gen_arm( {} "#, current_comment, - expand_intrinsic(¤t_arm, in_t[0]), - expand_intrinsic(¤t_aarch64, in_t[0]), + expand_intrinsic(¤t_arm, in_t[1]), + expand_intrinsic(¤t_aarch64, in_t[1]), call, ); let test = gen_test( @@ -887,6 +913,8 @@ fn expand_intrinsic(intr: &str, t: &str) -> String { "float64x2_t" => "f64", "poly8x8_t" => "i8", "poly8x16_t" => "i8", + "poly16x4_t" => "i16", + "poly16x8_t" => "i16", /* "poly64x1_t" => "i64x1", "poly64x2_t" => "i64x2", @@ -912,6 +940,10 @@ fn expand_intrinsic(intr: &str, t: &str) -> String { "uint32x4_t" => "u32", "uint64x1_t" => "u64", "uint64x2_t" => "u64", + "poly8x8_t" => "p8", + "poly8x16_t" => "p8", + "poly16x4_t" => "p16", + "poly16x8_t" => "p16", "float16x4_t" => "f16", "float16x8_t" => "f16", "float32x2_t" => "f32", @@ -978,11 +1010,13 @@ fn get_call( } else if s.contains(':') { let re_params: Vec<_> = s.split(':').map(|v| v.to_string()).collect(); if re_params[1] == "" { - re = Some((re_params[0].clone(), in_t[0].to_string())); + re = Some((re_params[0].clone(), in_t[1].to_string())); } else if re_params[1] == "in_t" { - re = Some((re_params[0].clone(), in_t[0].to_string())); + re = Some((re_params[0].clone(), in_t[1].to_string())); } else if re_params[1] == "out_t" { re = Some((re_params[0].clone(), out_t.to_string())); + } else if re_params[1] 
== "half" { + re = Some((re_params[0].clone(), type_to_half(in_t[1]).to_string())); } else { re = Some((re_params[0].clone(), re_params[1].clone())); } @@ -996,9 +1030,20 @@ fn get_call( } if fn_name == "fixed" { let (re_name, re_type) = re.unwrap(); - let fixed: Vec = fixed.iter().take(type_len(in_t[0])).cloned().collect(); + let fixed: Vec = fixed.iter().take(type_len(in_t[1])).cloned().collect(); return format!(r#"let {}{};"#, re_name, values(&re_type, &fixed)); } + if fn_name == "fixed-half-right" { + let fixed: Vec = fixed.iter().take(type_len(in_t[1])).cloned().collect(); + let half = fixed[type_len(in_t[1]) / 2..] + .iter() + .fold(String::new(), |mut s, fix| { + s.push_str(fix); + s.push_str(", "); + s + }); + return format!(r#"[{}]"#, &half[..half.len() - 2]); + } if fn_name.contains('-') { let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect(); assert_eq!(fn_format.len(), 3); @@ -1018,6 +1063,10 @@ fn get_call( } else if fn_format[1] == "noqself" { fn_name.push_str(type_to_noq_suffix(in_t[1])); } else if fn_format[1] == "nosuffix" { + } else if fn_format[1] == "in_len" { + fn_name.push_str(&type_len(in_t[1]).to_string()); + } else if fn_format[1] == "out_len" { + fn_name.push_str(&type_len(out_t).to_string()); } else { fn_name.push_str(&fn_format[1]); };