Add vst neon instructions (#1205)

* add vst neon instructions

* modify the instruction limit
This commit is contained in:
Sparrow Li 2021-09-01 04:35:30 +08:00 committed by GitHub
parent c9e0420448
commit 9e34c6d4c8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 2455 additions and 273 deletions

View file

@ -4592,6 +4592,84 @@ pub unsafe fn vld1q_f64_x4(a: *const f64) -> float64x2x4_t {
vld1q_f64_x4_(a)
}
/// Store multiple single-element structures to one, two, three, or four registers
///
/// Stores the two `float64x1_t` vectors of `b` to contiguous memory starting
/// at `a` (compiles to an AArch64 `ST1` two-register store, per `assert_instr`).
///
/// # Safety
/// `a` must be valid for writes of 2 `f64` values (16 bytes); alignment of
/// `f64` is assumed — TODO confirm against the intrinsic's alignment contract.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(st1))]
pub unsafe fn vst1_f64_x2(a: *mut f64, b: float64x1x2_t) {
#[allow(improper_ctypes)]
extern "unadjusted" {
// Binding to the LLVM intrinsic; note the destination pointer is the LAST
// parameter in the LLVM signature, while it is the FIRST in the public API.
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v1f64.p0f64")]
fn vst1_f64_x2_(a: float64x1_t, b: float64x1_t, ptr: *mut f64);
}
vst1_f64_x2_(b.0, b.1, a)
}
/// Store multiple single-element structures to one, two, three, or four registers
///
/// Stores the two `float64x2_t` vectors of `b` to contiguous memory starting
/// at `a` (compiles to an AArch64 `ST1` store, per `assert_instr`).
///
/// # Safety
/// `a` must be valid for writes of 4 `f64` values (32 bytes).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(st1))]
pub unsafe fn vst1q_f64_x2(a: *mut f64, b: float64x2x2_t) {
#[allow(improper_ctypes)]
extern "unadjusted" {
// LLVM intrinsic shim: destination pointer comes last in the LLVM signature.
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x2.v2f64.p0f64")]
fn vst1q_f64_x2_(a: float64x2_t, b: float64x2_t, ptr: *mut f64);
}
vst1q_f64_x2_(b.0, b.1, a)
}
/// Store multiple single-element structures to one, two, three, or four registers
///
/// Stores the three `float64x1_t` vectors of `b` to contiguous memory starting
/// at `a` (compiles to an AArch64 `ST1` store, per `assert_instr`).
///
/// # Safety
/// `a` must be valid for writes of 3 `f64` values (24 bytes).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(st1))]
pub unsafe fn vst1_f64_x3(a: *mut f64, b: float64x1x3_t) {
#[allow(improper_ctypes)]
extern "unadjusted" {
// LLVM intrinsic shim: destination pointer comes last in the LLVM signature.
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v1f64.p0f64")]
fn vst1_f64_x3_(a: float64x1_t, b: float64x1_t, c: float64x1_t, ptr: *mut f64);
}
vst1_f64_x3_(b.0, b.1, b.2, a)
}
/// Store multiple single-element structures to one, two, three, or four registers
///
/// Stores the three `float64x2_t` vectors of `b` to contiguous memory starting
/// at `a` (compiles to an AArch64 `ST1` store, per `assert_instr`).
///
/// # Safety
/// `a` must be valid for writes of 6 `f64` values (48 bytes).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(st1))]
pub unsafe fn vst1q_f64_x3(a: *mut f64, b: float64x2x3_t) {
#[allow(improper_ctypes)]
extern "unadjusted" {
// LLVM intrinsic shim: destination pointer comes last in the LLVM signature.
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x3.v2f64.p0f64")]
fn vst1q_f64_x3_(a: float64x2_t, b: float64x2_t, c: float64x2_t, ptr: *mut f64);
}
vst1q_f64_x3_(b.0, b.1, b.2, a)
}
/// Store multiple single-element structures to one, two, three, or four registers
///
/// Stores the four `float64x1_t` vectors of `b` to contiguous memory starting
/// at `a` (compiles to an AArch64 `ST1` store, per `assert_instr`).
///
/// # Safety
/// `a` must be valid for writes of 4 `f64` values (32 bytes).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(st1))]
pub unsafe fn vst1_f64_x4(a: *mut f64, b: float64x1x4_t) {
#[allow(improper_ctypes)]
extern "unadjusted" {
// LLVM intrinsic shim: destination pointer comes last in the LLVM signature.
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v1f64.p0f64")]
fn vst1_f64_x4_(a: float64x1_t, b: float64x1_t, c: float64x1_t, d: float64x1_t, ptr: *mut f64);
}
vst1_f64_x4_(b.0, b.1, b.2, b.3, a)
}
/// Store multiple single-element structures to one, two, three, or four registers
///
/// Stores the four `float64x2_t` vectors of `b` to contiguous memory starting
/// at `a` (compiles to an AArch64 `ST1` store, per `assert_instr`).
///
/// # Safety
/// `a` must be valid for writes of 8 `f64` values (64 bytes).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(st1))]
pub unsafe fn vst1q_f64_x4(a: *mut f64, b: float64x2x4_t) {
#[allow(improper_ctypes)]
extern "unadjusted" {
// LLVM intrinsic shim: destination pointer comes last in the LLVM signature.
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st1x4.v2f64.p0f64")]
fn vst1q_f64_x4_(a: float64x2_t, b: float64x2_t, c: float64x2_t, d: float64x2_t, ptr: *mut f64);
}
vst1q_f64_x4_(b.0, b.1, b.2, b.3, a)
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
@ -12983,6 +13061,60 @@ mod test {
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vst1_f64_x2() {
    // Round-trip: load two f64 lanes starting at src[1], store them into dst,
    // and check the store wrote exactly those lanes contiguously.
    let src: [f64; 3] = [0., 1., 2.];
    let expected: [f64; 2] = [1., 2.];
    let mut dst = [0f64; 2];
    let loaded = vld1_f64_x2(src[1..].as_ptr());
    vst1_f64_x2(dst.as_mut_ptr(), loaded);
    assert_eq!(dst, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vst1q_f64_x2() {
    // Round-trip: load four f64 lanes (two q-registers) from src[1..],
    // store them into dst, and verify the contiguous result.
    let src: [f64; 5] = [0., 1., 2., 3., 4.];
    let expected: [f64; 4] = [1., 2., 3., 4.];
    let mut dst = [0f64; 4];
    let loaded = vld1q_f64_x2(src[1..].as_ptr());
    vst1q_f64_x2(dst.as_mut_ptr(), loaded);
    assert_eq!(dst, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vst1_f64_x3() {
    // Round-trip through three d-registers: load from src[1..], store to dst.
    let src: [f64; 4] = [0., 1., 2., 3.];
    let expected: [f64; 3] = [1., 2., 3.];
    let mut dst = [0f64; 3];
    let loaded = vld1_f64_x3(src[1..].as_ptr());
    vst1_f64_x3(dst.as_mut_ptr(), loaded);
    assert_eq!(dst, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vst1q_f64_x3() {
    // Round-trip through three q-registers (6 lanes): load from src[1..],
    // store to dst, compare against the expected contiguous values.
    let src: [f64; 7] = [0., 1., 2., 3., 4., 5., 6.];
    let expected: [f64; 6] = [1., 2., 3., 4., 5., 6.];
    let mut dst = [0f64; 6];
    let loaded = vld1q_f64_x3(src[1..].as_ptr());
    vst1q_f64_x3(dst.as_mut_ptr(), loaded);
    assert_eq!(dst, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vst1_f64_x4() {
    // Round-trip through four d-registers (4 lanes): load from src[1..],
    // store to dst, compare.
    let src: [f64; 5] = [0., 1., 2., 3., 4.];
    let expected: [f64; 4] = [1., 2., 3., 4.];
    let mut dst = [0f64; 4];
    let loaded = vld1_f64_x4(src[1..].as_ptr());
    vst1_f64_x4(dst.as_mut_ptr(), loaded);
    assert_eq!(dst, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vst1q_f64_x4() {
    // Round-trip through four q-registers (8 lanes): load from src[1..],
    // store to dst, compare.
    let src: [f64; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
    let expected: [f64; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
    let mut dst = [0f64; 8];
    let loaded = vld1q_f64_x4(src[1..].as_ptr());
    vst1q_f64_x4(dst.as_mut_ptr(), loaded);
    assert_eq!(dst, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_f64() {
let a: f64 = 1.0;

View file

@ -955,6 +955,7 @@ multi_fn = static_assert-N-1-bits
a = 1, 2, 3, 4
n = 2
validate 0.25, 0.5, 0.75, 1.
arm-aarch64-separate
aarch64 = scvtf
link-aarch64 = vcvtfxs2fp._EXT2_._EXT_
@ -971,6 +972,7 @@ link-aarch64 = vcvtfxs2fp._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfxs2fp._EXT2_._EXT_
const-arm = N:i32
generate int32x2_t:float32x2_t, int32x4_t:float32x4_t
aarch64 = ucvtf
@ -988,6 +990,7 @@ multi_fn = static_assert-N-1-bits
a = 0.25, 0.5, 0.75, 1.
n = 2
validate 1, 2, 3, 4
arm-aarch64-separate
aarch64 = fcvtzs
link-aarch64 = vcvtfp2fxs._EXT2_._EXT_
@ -2038,7 +2041,7 @@ name = vld1
out-suffix
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
test = load_test
load_fn
aarch64 = ld1
link-aarch64 = ld1x2._EXT2_
@ -2064,7 +2067,7 @@ multi_fn = transmute, {vld1-outsigned-noext, transmute(a)}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
test = load_test
load_fn
aarch64 = ld1
arm = vld1
generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t
@ -2083,7 +2086,7 @@ name = vld1
out-suffix
a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
test = load_test
load_fn
aarch64 = ld1
link-aarch64 = ld1x2._EXT2_
@ -2108,6 +2111,80 @@ link-aarch64 = ld1x4._EXT2_
link-arm = vld1x4._EXT2_
generate *const f32:float32x2x4_t, *const f32:float32x4x4_t
/// Store multiple single-element structures from one, two, three, or four registers
name = vst1
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
store_fn
arm-aarch64-separate
aarch64 = st1
link-aarch64 = st1x2._EXT3_
arm = vst1
link-arm = vst1x2._EXT3_
generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void, *mut i64:int64x1x2_t:void
generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void, *mut i64:int64x2x2_t:void
link-aarch64 = st1x3._EXT3_
link-arm = vst1x3._EXT3_
generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void, *mut i64:int64x1x3_t:void
generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void, *mut i64:int64x2x3_t:void
link-aarch64 = st1x4._EXT3_
link-arm = vst1x4._EXT3_
generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void, *mut i64:int64x1x4_t:void
generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void, *mut i64:int64x2x4_t:void
/// Store multiple single-element structures to one, two, three, or four registers
name = vst1
multi_fn = vst1-signed-noext, transmute(a), transmute(b)
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
store_fn
aarch64 = st1
arm = vst1
generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void, *mut u64:uint64x1x2_t:void
generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void, *mut u64:uint64x2x2_t:void
generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void, *mut u64:uint64x1x3_t:void
generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void, *mut u64:uint64x2x3_t:void
generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void, *mut u64:uint64x1x4_t:void
generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void, *mut u64:uint64x2x4_t:void
generate *mut p8:poly8x8x2_t:void, *mut p8:poly8x8x3_t:void, *mut p8:poly8x8x4_t:void
generate *mut p8:poly8x16x2_t:void, *mut p8:poly8x16x3_t:void, *mut p8:poly8x16x4_t:void
generate *mut p16:poly16x4x2_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x4x4_t:void
generate *mut p16:poly16x8x2_t:void, *mut p16:poly16x8x3_t:void, *mut p16:poly16x8x4_t:void
/// Store multiple single-element structures to one, two, three, or four registers
name = vst1
a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
store_fn
arm-aarch64-separate
aarch64 = st1
link-aarch64 = st1x2._EXT3_
generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void
link-aarch64 = st1x3._EXT3_
generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void
link-aarch64 = st1x4._EXT3_
generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void
arm = vst1
link-aarch64 = st1x2._EXT3_
link-arm = vst1x2._EXT3_
generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void
link-aarch64 = st1x3._EXT3_
link-arm = vst1x3._EXT3_
generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void
link-aarch64 = st1x4._EXT3_
link-arm = vst1x4._EXT3_
generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void
/// Multiply
name = vmul
a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
@ -3869,6 +3946,7 @@ const-aarch64 = N
arm = vqrshrn
link-arm = vqrshiftns._EXT2_
const-arm = -N as ttn
arm-aarch64-separate
generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
/// Signed saturating rounded shift right narrow
@ -3915,6 +3993,7 @@ const-aarch64 = N
arm = vqrshrn
link-arm = vqrshiftnu._EXT2_
const-arm = -N as ttn
arm-aarch64-separate
generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
/// Unsigned saturating rounded shift right narrow
@ -3961,6 +4040,7 @@ const-aarch64 = N
arm = vqrshrun
link-arm = vqrshiftnsu._EXT2_
const-arm = -N as ttn
arm-aarch64-separate
generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t
/// Signed saturating rounded shift right unsigned narrow
@ -4106,6 +4186,7 @@ multi_fn = static_assert-N-1-halfbits
a = 0, 4, 8, 12, 16, 20, 24, 28
n = 2
validate 0, 1, 2, 3, 4, 5, 6, 7
arm-aarch64-separate
aarch64 = sqshrn
link-aarch64 = sqshrn._EXT2_
@ -4152,6 +4233,7 @@ multi_fn = static_assert-N-1-halfbits
a = 0, 4, 8, 12, 16, 20, 24, 28
n = 2
validate 0, 1, 2, 3, 4, 5, 6, 7
arm-aarch64-separate
aarch64 = uqshrn
link-aarch64 = uqshrn._EXT2_
@ -4198,6 +4280,7 @@ multi_fn = static_assert-N-1-halfbits
a = 0, 4, 8, 12, 16, 20, 24, 28
n = 2
validate 0, 1, 2, 3, 4, 5, 6, 7
arm-aarch64-separate
aarch64 = sqshrun
link-aarch64 = sqshrun._EXT2_
@ -4542,6 +4625,7 @@ multi_fn = static_assert-N-1-halfbits
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
arm-aarch64-separate
aarch64 = rshrn
link-aarch64 = rshrn._EXT2_

File diff suppressed because it is too large Load diff

View file

@ -124,6 +124,9 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
// vfmaq_n_f32_vfma : #instructions = 26 >= 22 (limit)
"usad8" | "vfma" | "vfms" => 27,
"qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29,
// core_arch/src/arm_shared/simd32
// vst1q_s64_x4_vst1 : #instructions = 40 >= 22 (limit)
"vst1" => 41,
// Temporary, currently the fptosi.sat and fptoui.sat LLVM
// intrinsics emit unnecessary code on arm. This can be