add vldx neon instructions (#1200)

This commit is contained in:
Sparrow Li 2021-08-25 02:51:30 +08:00 committed by GitHub
parent b10d00cae0
commit 4baf95fddd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
19 changed files with 2730 additions and 703 deletions

View file

@ -1,4 +1,4 @@
extern "C" {
extern "unadjusted" {
#[link_name = "llvm.aarch64.crc32x"]
fn crc32x_(crc: u32, data: u64) -> u32;

View file

@ -25,48 +25,38 @@ types! {
pub struct float64x2_t(f64, f64);
}
/// ARM-specific type containing two `int8x16_t` vectors.
/// ARM-specific type containing two `float64x1_t` vectors.
#[derive(Copy, Clone)]
pub struct int8x16x2_t(pub int8x16_t, pub int8x16_t);
/// ARM-specific type containing three `int8x16_t` vectors.
pub struct float64x1x2_t(pub float64x1_t, pub float64x1_t);
/// ARM-specific type containing three `float64x1_t` vectors.
#[derive(Copy, Clone)]
pub struct int8x16x3_t(pub int8x16_t, pub int8x16_t, pub int8x16_t);
/// ARM-specific type containing four `int8x16_t` vectors.
pub struct float64x1x3_t(pub float64x1_t, pub float64x1_t, pub float64x1_t);
/// ARM-specific type containing four `float64x1_t` vectors.
#[derive(Copy, Clone)]
pub struct int8x16x4_t(pub int8x16_t, pub int8x16_t, pub int8x16_t, pub int8x16_t);
/// ARM-specific type containing two `uint8x16_t` vectors.
#[derive(Copy, Clone)]
pub struct uint8x16x2_t(pub uint8x16_t, pub uint8x16_t);
/// ARM-specific type containing three `uint8x16_t` vectors.
#[derive(Copy, Clone)]
pub struct uint8x16x3_t(pub uint8x16_t, pub uint8x16_t, pub uint8x16_t);
/// ARM-specific type containing four `uint8x16_t` vectors.
#[derive(Copy, Clone)]
pub struct uint8x16x4_t(
pub uint8x16_t,
pub uint8x16_t,
pub uint8x16_t,
pub uint8x16_t,
pub struct float64x1x4_t(
pub float64x1_t,
pub float64x1_t,
pub float64x1_t,
pub float64x1_t,
);
/// ARM-specific type containing two `poly8x16_t` vectors.
/// ARM-specific type containing two `float64x2_t` vectors.
#[derive(Copy, Clone)]
pub struct poly8x16x2_t(pub poly8x16_t, pub poly8x16_t);
/// ARM-specific type containing three `poly8x16_t` vectors.
pub struct float64x2x2_t(pub float64x2_t, pub float64x2_t);
/// ARM-specific type containing three `float64x2_t` vectors.
#[derive(Copy, Clone)]
pub struct poly8x16x3_t(pub poly8x16_t, pub poly8x16_t, pub poly8x16_t);
/// ARM-specific type containing four `poly8x16_t` vectors.
pub struct float64x2x3_t(pub float64x2_t, pub float64x2_t, pub float64x2_t);
/// ARM-specific type containing four `float64x2_t` vectors.
#[derive(Copy, Clone)]
pub struct poly8x16x4_t(
pub poly8x16_t,
pub poly8x16_t,
pub poly8x16_t,
pub poly8x16_t,
pub struct float64x2x4_t(
pub float64x2_t,
pub float64x2_t,
pub float64x2_t,
pub float64x2_t,
);
#[allow(improper_ctypes)]
extern "C" {
extern "unadjusted" {
// absolute value
#[link_name = "llvm.aarch64.neon.abs.i64"]
fn vabsd_s64_(a: i64) -> i64;

View file

@ -1,7 +1,7 @@
#[cfg(test)]
use stdarch_test::assert_instr;
extern "C" {
extern "unadjusted" {
#[link_name = "llvm.prefetch"]
fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
}

View file

@ -17,7 +17,7 @@
#[cfg(test)]
use stdarch_test::assert_instr;
extern "C" {
extern "unadjusted" {
#[link_name = "llvm.aarch64.tstart"]
fn aarch64_tstart() -> u64;
#[link_name = "llvm.aarch64.tcommit"]

View file

@ -32,7 +32,7 @@ types! {
pub struct uint16x2_t(u16, u16);
}
extern "C" {
extern "unadjusted" {
#[link_name = "llvm.arm.smulbb"]
fn arm_smulbb(a: i32, b: i32) -> i32;

View file

@ -11,7 +11,7 @@
doc
))]
pub unsafe fn __clrex() {
extern "C" {
extern "unadjusted" {
#[link_name = "llvm.arm.clrex"]
fn clrex();
}
@ -27,7 +27,7 @@ pub unsafe fn __clrex() {
doc
))]
pub unsafe fn __ldrexb(p: *const u8) -> u8 {
extern "C" {
extern "unadjusted" {
#[link_name = "llvm.arm.ldrex.p0i8"]
fn ldrex8(p: *const u8) -> u32;
}
@ -43,7 +43,7 @@ pub unsafe fn __ldrexb(p: *const u8) -> u8 {
doc
))]
pub unsafe fn __ldrexh(p: *const u16) -> u16 {
extern "C" {
extern "unadjusted" {
#[link_name = "llvm.arm.ldrex.p0i16"]
fn ldrex16(p: *const u16) -> u32;
}
@ -60,7 +60,7 @@ pub unsafe fn __ldrexh(p: *const u16) -> u16 {
doc
))]
pub unsafe fn __ldrex(p: *const u32) -> u32 {
extern "C" {
extern "unadjusted" {
#[link_name = "llvm.arm.ldrex.p0i32"]
fn ldrex32(p: *const u32) -> u32;
}
@ -78,7 +78,7 @@ pub unsafe fn __ldrex(p: *const u32) -> u32 {
doc
))]
pub unsafe fn __strexb(value: u32, addr: *mut u8) -> u32 {
extern "C" {
extern "unadjusted" {
#[link_name = "llvm.arm.strex.p0i8"]
fn strex8(value: u32, addr: *mut u8) -> u32;
}
@ -97,7 +97,7 @@ pub unsafe fn __strexb(value: u32, addr: *mut u8) -> u32 {
doc
))]
pub unsafe fn __strexh(value: u16, addr: *mut u16) -> u32 {
extern "C" {
extern "unadjusted" {
#[link_name = "llvm.arm.strex.p0i16"]
fn strex16(value: u32, addr: *mut u16) -> u32;
}
@ -116,7 +116,7 @@ pub unsafe fn __strexh(value: u16, addr: *mut u16) -> u32 {
doc
))]
pub unsafe fn __strex(value: u32, addr: *mut u32) -> u32 {
extern "C" {
extern "unadjusted" {
#[link_name = "llvm.arm.strex.p0i32"]
fn strex32(value: u32, addr: *mut u32) -> u32;
}

View file

@ -107,7 +107,7 @@ pub unsafe fn __dbg<const IMM4: i32>() {
dbg(IMM4);
}
extern "C" {
extern "unadjusted" {
#[link_name = "llvm.arm.dbg"]
fn dbg(_: i32);
}

View file

@ -12,7 +12,7 @@ pub(crate) type p8 = u8;
pub(crate) type p16 = u16;
#[allow(improper_ctypes)]
extern "C" {
extern "unadjusted" {
#[link_name = "llvm.arm.neon.vbsl.v8i8"]
fn vbsl_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t;
#[link_name = "llvm.arm.neon.vbsl.v16i8"]

View file

@ -80,7 +80,7 @@ macro_rules! dsp_call {
};
}
extern "C" {
extern "unadjusted" {
#[link_name = "llvm.arm.qadd8"]
fn arm_qadd8(a: i32, b: i32) -> i32;

View file

@ -122,7 +122,7 @@ where
arg.__isb()
}
extern "C" {
extern "unadjusted" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.dmb")]
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.dmb")]
fn dmb(_: i32);

View file

@ -1,4 +1,4 @@
extern "C" {
extern "unadjusted" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crc32b")]
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32b")]
fn crc32b_(crc: u32, data: u32) -> u32;

View file

@ -1,7 +1,7 @@
use crate::core_arch::arm_shared::{uint32x4_t, uint8x16_t};
#[allow(improper_ctypes)]
extern "C" {
extern "unadjusted" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.aese")]
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.aese")]
fn vaeseq_u8_(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t;

View file

@ -80,7 +80,7 @@ pub unsafe fn __nop() {
asm!("nop", options(nomem, nostack, preserves_flags));
}
extern "C" {
extern "unadjusted" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.hint")]
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.hint")]
fn hint(_: i32);

View file

@ -92,6 +92,16 @@ pub struct int8x8x3_t(pub int8x8_t, pub int8x8_t, pub int8x8_t);
#[derive(Copy, Clone)]
pub struct int8x8x4_t(pub int8x8_t, pub int8x8_t, pub int8x8_t, pub int8x8_t);
/// ARM-specific type containing two `int8x16_t` vectors.
#[derive(Copy, Clone)]
pub struct int8x16x2_t(pub int8x16_t, pub int8x16_t);
/// ARM-specific type containing three `int8x16_t` vectors.
#[derive(Copy, Clone)]
pub struct int8x16x3_t(pub int8x16_t, pub int8x16_t, pub int8x16_t);
/// ARM-specific type containing four `int8x16_t` vectors.
#[derive(Copy, Clone)]
pub struct int8x16x4_t(pub int8x16_t, pub int8x16_t, pub int8x16_t, pub int8x16_t);
/// ARM-specific type containing two `uint8x8_t` vectors.
#[derive(Copy, Clone)]
pub struct uint8x8x2_t(pub uint8x8_t, pub uint8x8_t);
@ -102,6 +112,21 @@ pub struct uint8x8x3_t(pub uint8x8_t, pub uint8x8_t, pub uint8x8_t);
#[derive(Copy, Clone)]
pub struct uint8x8x4_t(pub uint8x8_t, pub uint8x8_t, pub uint8x8_t, pub uint8x8_t);
/// ARM-specific type containing two `uint8x16_t` vectors.
#[derive(Copy, Clone)]
pub struct uint8x16x2_t(pub uint8x16_t, pub uint8x16_t);
/// ARM-specific type containing three `uint8x16_t` vectors.
#[derive(Copy, Clone)]
pub struct uint8x16x3_t(pub uint8x16_t, pub uint8x16_t, pub uint8x16_t);
/// ARM-specific type containing four `uint8x16_t` vectors.
#[derive(Copy, Clone)]
pub struct uint8x16x4_t(
pub uint8x16_t,
pub uint8x16_t,
pub uint8x16_t,
pub uint8x16_t,
);
/// ARM-specific type containing two `poly8x8_t` vectors.
#[derive(Copy, Clone)]
pub struct poly8x8x2_t(pub poly8x8_t, pub poly8x8_t);
@ -112,8 +137,233 @@ pub struct poly8x8x3_t(pub poly8x8_t, pub poly8x8_t, pub poly8x8_t);
#[derive(Copy, Clone)]
pub struct poly8x8x4_t(pub poly8x8_t, pub poly8x8_t, pub poly8x8_t, pub poly8x8_t);
/// ARM-specific type containing two `poly8x16_t` vectors.
#[derive(Copy, Clone)]
pub struct poly8x16x2_t(pub poly8x16_t, pub poly8x16_t);
/// ARM-specific type containing three `poly8x16_t` vectors.
#[derive(Copy, Clone)]
pub struct poly8x16x3_t(pub poly8x16_t, pub poly8x16_t, pub poly8x16_t);
/// ARM-specific type containing four `poly8x16_t` vectors.
#[derive(Copy, Clone)]
pub struct poly8x16x4_t(
pub poly8x16_t,
pub poly8x16_t,
pub poly8x16_t,
pub poly8x16_t,
);
/// ARM-specific type containing two `int16x4_t` vectors.
#[derive(Copy, Clone)]
pub struct int16x4x2_t(pub int16x4_t, pub int16x4_t);
/// ARM-specific type containing three `int16x4_t` vectors.
#[derive(Copy, Clone)]
pub struct int16x4x3_t(pub int16x4_t, pub int16x4_t, pub int16x4_t);
/// ARM-specific type containing four `int16x4_t` vectors.
#[derive(Copy, Clone)]
pub struct int16x4x4_t(pub int16x4_t, pub int16x4_t, pub int16x4_t, pub int16x4_t);
/// ARM-specific type containing two `int16x8_t` vectors.
#[derive(Copy, Clone)]
pub struct int16x8x2_t(pub int16x8_t, pub int16x8_t);
/// ARM-specific type containing three `int16x8_t` vectors.
#[derive(Copy, Clone)]
pub struct int16x8x3_t(pub int16x8_t, pub int16x8_t, pub int16x8_t);
/// ARM-specific type containing four `int16x8_t` vectors.
#[derive(Copy, Clone)]
pub struct int16x8x4_t(pub int16x8_t, pub int16x8_t, pub int16x8_t, pub int16x8_t);
/// ARM-specific type containing two `uint16x4_t` vectors.
#[derive(Copy, Clone)]
pub struct uint16x4x2_t(pub uint16x4_t, pub uint16x4_t);
/// ARM-specific type containing three `uint16x4_t` vectors.
#[derive(Copy, Clone)]
pub struct uint16x4x3_t(pub uint16x4_t, pub uint16x4_t, pub uint16x4_t);
/// ARM-specific type containing four `uint16x4_t` vectors.
#[derive(Copy, Clone)]
pub struct uint16x4x4_t(
pub uint16x4_t,
pub uint16x4_t,
pub uint16x4_t,
pub uint16x4_t,
);
/// ARM-specific type containing two `uint16x8_t` vectors.
#[derive(Copy, Clone)]
pub struct uint16x8x2_t(pub uint16x8_t, pub uint16x8_t);
/// ARM-specific type containing three `uint16x8_t` vectors.
#[derive(Copy, Clone)]
pub struct uint16x8x3_t(pub uint16x8_t, pub uint16x8_t, pub uint16x8_t);
/// ARM-specific type containing four `uint16x8_t` vectors.
#[derive(Copy, Clone)]
pub struct uint16x8x4_t(
pub uint16x8_t,
pub uint16x8_t,
pub uint16x8_t,
pub uint16x8_t,
);
/// ARM-specific type containing two `poly16x4_t` vectors.
#[derive(Copy, Clone)]
pub struct poly16x4x2_t(pub poly16x4_t, pub poly16x4_t);
/// ARM-specific type containing three `poly16x4_t` vectors.
#[derive(Copy, Clone)]
pub struct poly16x4x3_t(pub poly16x4_t, pub poly16x4_t, pub poly16x4_t);
/// ARM-specific type containing four `poly16x4_t` vectors.
#[derive(Copy, Clone)]
pub struct poly16x4x4_t(
pub poly16x4_t,
pub poly16x4_t,
pub poly16x4_t,
pub poly16x4_t,
);
/// ARM-specific type containing two `poly16x8_t` vectors.
#[derive(Copy, Clone)]
pub struct poly16x8x2_t(pub poly16x8_t, pub poly16x8_t);
/// ARM-specific type containing three `poly16x8_t` vectors.
#[derive(Copy, Clone)]
pub struct poly16x8x3_t(pub poly16x8_t, pub poly16x8_t, pub poly16x8_t);
/// ARM-specific type containing four `poly16x8_t` vectors.
#[derive(Copy, Clone)]
pub struct poly16x8x4_t(
pub poly16x8_t,
pub poly16x8_t,
pub poly16x8_t,
pub poly16x8_t,
);
/// ARM-specific type containing two `int32x2_t` vectors.
#[derive(Copy, Clone)]
pub struct int32x2x2_t(pub int32x2_t, pub int32x2_t);
/// ARM-specific type containing three `int32x2_t` vectors.
#[derive(Copy, Clone)]
pub struct int32x2x3_t(pub int32x2_t, pub int32x2_t, pub int32x2_t);
/// ARM-specific type containing four `int32x2_t` vectors.
#[derive(Copy, Clone)]
pub struct int32x2x4_t(pub int32x2_t, pub int32x2_t, pub int32x2_t, pub int32x2_t);
/// ARM-specific type containing two `int32x4_t` vectors.
#[derive(Copy, Clone)]
pub struct int32x4x2_t(pub int32x4_t, pub int32x4_t);
/// ARM-specific type containing three `int32x4_t` vectors.
#[derive(Copy, Clone)]
pub struct int32x4x3_t(pub int32x4_t, pub int32x4_t, pub int32x4_t);
/// ARM-specific type containing four `int32x4_t` vectors.
#[derive(Copy, Clone)]
pub struct int32x4x4_t(pub int32x4_t, pub int32x4_t, pub int32x4_t, pub int32x4_t);
/// ARM-specific type containing two `uint32x2_t` vectors.
#[derive(Copy, Clone)]
pub struct uint32x2x2_t(pub uint32x2_t, pub uint32x2_t);
/// ARM-specific type containing three `uint32x2_t` vectors.
#[derive(Copy, Clone)]
pub struct uint32x2x3_t(pub uint32x2_t, pub uint32x2_t, pub uint32x2_t);
/// ARM-specific type containing four `uint32x2_t` vectors.
#[derive(Copy, Clone)]
pub struct uint32x2x4_t(
pub uint32x2_t,
pub uint32x2_t,
pub uint32x2_t,
pub uint32x2_t,
);
/// ARM-specific type containing two `uint32x4_t` vectors.
#[derive(Copy, Clone)]
pub struct uint32x4x2_t(pub uint32x4_t, pub uint32x4_t);
/// ARM-specific type containing three `uint32x4_t` vectors.
#[derive(Copy, Clone)]
pub struct uint32x4x3_t(pub uint32x4_t, pub uint32x4_t, pub uint32x4_t);
/// ARM-specific type containing four `uint32x4_t` vectors.
#[derive(Copy, Clone)]
pub struct uint32x4x4_t(
pub uint32x4_t,
pub uint32x4_t,
pub uint32x4_t,
pub uint32x4_t,
);
/// ARM-specific type containing two `float32x2_t` vectors.
#[derive(Copy, Clone)]
pub struct float32x2x2_t(pub float32x2_t, pub float32x2_t);
/// ARM-specific type containing three `float32x2_t` vectors.
#[derive(Copy, Clone)]
pub struct float32x2x3_t(pub float32x2_t, pub float32x2_t, pub float32x2_t);
/// ARM-specific type containing four `float32x2_t` vectors.
#[derive(Copy, Clone)]
pub struct float32x2x4_t(
pub float32x2_t,
pub float32x2_t,
pub float32x2_t,
pub float32x2_t,
);
/// ARM-specific type containing two `float32x4_t` vectors.
#[derive(Copy, Clone)]
pub struct float32x4x2_t(pub float32x4_t, pub float32x4_t);
/// ARM-specific type containing three `float32x4_t` vectors.
#[derive(Copy, Clone)]
pub struct float32x4x3_t(pub float32x4_t, pub float32x4_t, pub float32x4_t);
/// ARM-specific type containing four `float32x4_t` vectors.
#[derive(Copy, Clone)]
pub struct float32x4x4_t(
pub float32x4_t,
pub float32x4_t,
pub float32x4_t,
pub float32x4_t,
);
/// ARM-specific type containing two `int64x1_t` vectors.
#[derive(Copy, Clone)]
pub struct int64x1x2_t(pub int64x1_t, pub int64x1_t);
/// ARM-specific type containing three `int64x1_t` vectors.
#[derive(Copy, Clone)]
pub struct int64x1x3_t(pub int64x1_t, pub int64x1_t, pub int64x1_t);
/// ARM-specific type containing four `int64x1_t` vectors.
#[derive(Copy, Clone)]
pub struct int64x1x4_t(pub int64x1_t, pub int64x1_t, pub int64x1_t, pub int64x1_t);
/// ARM-specific type containing two `int64x2_t` vectors.
#[derive(Copy, Clone)]
pub struct int64x2x2_t(pub int64x2_t, pub int64x2_t);
/// ARM-specific type containing three `int64x2_t` vectors.
#[derive(Copy, Clone)]
pub struct int64x2x3_t(pub int64x2_t, pub int64x2_t, pub int64x2_t);
/// ARM-specific type containing four `int64x2_t` vectors.
#[derive(Copy, Clone)]
pub struct int64x2x4_t(pub int64x2_t, pub int64x2_t, pub int64x2_t, pub int64x2_t);
/// ARM-specific type containing two `uint64x1_t` vectors.
#[derive(Copy, Clone)]
pub struct uint64x1x2_t(pub uint64x1_t, pub uint64x1_t);
/// ARM-specific type containing three `uint64x1_t` vectors.
#[derive(Copy, Clone)]
pub struct uint64x1x3_t(pub uint64x1_t, pub uint64x1_t, pub uint64x1_t);
/// ARM-specific type containing four `uint64x1_t` vectors.
#[derive(Copy, Clone)]
pub struct uint64x1x4_t(
pub uint64x1_t,
pub uint64x1_t,
pub uint64x1_t,
pub uint64x1_t,
);
/// ARM-specific type containing two `uint64x2_t` vectors.
#[derive(Copy, Clone)]
pub struct uint64x2x2_t(pub uint64x2_t, pub uint64x2_t);
/// ARM-specific type containing three `uint64x2_t` vectors.
#[derive(Copy, Clone)]
pub struct uint64x2x3_t(pub uint64x2_t, pub uint64x2_t, pub uint64x2_t);
/// ARM-specific type containing four `uint64x2_t` vectors.
#[derive(Copy, Clone)]
pub struct uint64x2x4_t(
pub uint64x2_t,
pub uint64x2_t,
pub uint64x2_t,
pub uint64x2_t,
);
#[allow(improper_ctypes)]
extern "C" {
extern "unadjusted" {
// absolute value (64-bit)
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vabs.v8i8")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.abs.v8i8")]
@ -2867,11 +3117,7 @@ pub unsafe fn vpmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.32", IMM5 = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mov, IMM5 = 1))]
// Based on the discussion in https://github.com/rust-lang/stdarch/pull/792
// `mov` seems to be an acceptable intrinsic to compile to
// #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(vmov, IMM5 = 1))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 1))]
pub unsafe fn vgetq_lane_u64<const IMM5: i32>(v: uint64x2_t) -> u64 {
static_assert_imm1!(IMM5);
simd_extract(v, IMM5 as u32)
@ -2882,10 +3128,7 @@ pub unsafe fn vgetq_lane_u64<const IMM5: i32>(v: uint64x2_t) -> u64 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.32", IMM5 = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmov, IMM5 = 0))]
// FIXME: no 32bit this seems to be turned into two vmov.32 instructions
// validate correctness
#[cfg_attr(test, assert_instr(nop, IMM5 = 0))]
pub unsafe fn vget_lane_u64<const IMM5: i32>(v: uint64x1_t) -> u64 {
static_assert!(IMM5 : i32 where IMM5 == 0);
simd_extract(v, 0)
@ -2896,8 +3139,7 @@ pub unsafe fn vget_lane_u64<const IMM5: i32>(v: uint64x1_t) -> u64 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.u16", IMM5 = 2))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umov, IMM5 = 2))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
pub unsafe fn vget_lane_u16<const IMM5: i32>(v: uint16x4_t) -> u16 {
static_assert_imm2!(IMM5);
simd_extract(v, IMM5 as u32)
@ -2908,8 +3150,7 @@ pub unsafe fn vget_lane_u16<const IMM5: i32>(v: uint16x4_t) -> u16 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.s16", IMM5 = 2))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smov, IMM5 = 2))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
pub unsafe fn vget_lane_s16<const IMM5: i32>(v: int16x4_t) -> i16 {
static_assert_imm2!(IMM5);
simd_extract(v, IMM5 as u32)
@ -2920,8 +3161,7 @@ pub unsafe fn vget_lane_s16<const IMM5: i32>(v: int16x4_t) -> i16 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.u16", IMM5 = 2))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umov, IMM5 = 2))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
pub unsafe fn vget_lane_p16<const IMM5: i32>(v: poly16x4_t) -> p16 {
static_assert_imm2!(IMM5);
simd_extract(v, IMM5 as u32)
@ -2932,8 +3172,7 @@ pub unsafe fn vget_lane_p16<const IMM5: i32>(v: poly16x4_t) -> p16 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.32", IMM5 = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mov, IMM5 = 1))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 1))]
pub unsafe fn vget_lane_u32<const IMM5: i32>(v: uint32x2_t) -> u32 {
static_assert_imm1!(IMM5);
simd_extract(v, IMM5 as u32)
@ -2944,8 +3183,7 @@ pub unsafe fn vget_lane_u32<const IMM5: i32>(v: uint32x2_t) -> u32 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.32", IMM5 = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mov, IMM5 = 1))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 1))]
pub unsafe fn vget_lane_s32<const IMM5: i32>(v: int32x2_t) -> i32 {
static_assert_imm1!(IMM5);
simd_extract(v, IMM5 as u32)
@ -2956,8 +3194,7 @@ pub unsafe fn vget_lane_s32<const IMM5: i32>(v: int32x2_t) -> i32 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.f32", IMM5 = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mov, IMM5 = 1))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 1))]
pub unsafe fn vget_lane_f32<const IMM5: i32>(v: float32x2_t) -> f32 {
static_assert_imm1!(IMM5);
simd_extract(v, IMM5 as u32)
@ -2968,8 +3205,7 @@ pub unsafe fn vget_lane_f32<const IMM5: i32>(v: float32x2_t) -> f32 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.f32", IMM5 = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mov, IMM5 = 1))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 1))]
pub unsafe fn vgetq_lane_f32<const IMM5: i32>(v: float32x4_t) -> f32 {
static_assert_imm2!(IMM5);
simd_extract(v, IMM5 as u32)
@ -2980,8 +3216,7 @@ pub unsafe fn vgetq_lane_f32<const IMM5: i32>(v: float32x4_t) -> f32 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.32", IMM5 = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmov, IMM5 = 0))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 0))]
pub unsafe fn vget_lane_p64<const IMM5: i32>(v: poly64x1_t) -> p64 {
static_assert!(IMM5 : i32 where IMM5 == 0);
simd_extract(v, IMM5 as u32)
@ -2992,8 +3227,7 @@ pub unsafe fn vget_lane_p64<const IMM5: i32>(v: poly64x1_t) -> p64 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.32", IMM5 = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmov, IMM5 = 0))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 0))]
pub unsafe fn vgetq_lane_p64<const IMM5: i32>(v: poly64x2_t) -> p64 {
static_assert_imm1!(IMM5);
simd_extract(v, IMM5 as u32)
@ -3004,8 +3238,7 @@ pub unsafe fn vgetq_lane_p64<const IMM5: i32>(v: poly64x2_t) -> p64 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.32", IMM5 = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmov, IMM5 = 0))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 0))]
pub unsafe fn vget_lane_s64<const IMM5: i32>(v: int64x1_t) -> i64 {
static_assert!(IMM5 : i32 where IMM5 == 0);
simd_extract(v, IMM5 as u32)
@ -3016,8 +3249,7 @@ pub unsafe fn vget_lane_s64<const IMM5: i32>(v: int64x1_t) -> i64 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.32", IMM5 = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmov, IMM5 = 0))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 0))]
pub unsafe fn vgetq_lane_s64<const IMM5: i32>(v: int64x2_t) -> i64 {
static_assert_imm1!(IMM5);
simd_extract(v, IMM5 as u32)
@ -3028,8 +3260,7 @@ pub unsafe fn vgetq_lane_s64<const IMM5: i32>(v: int64x2_t) -> i64 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.u16", IMM5 = 2))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umov, IMM5 = 2))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
pub unsafe fn vgetq_lane_u16<const IMM5: i32>(v: uint16x8_t) -> u16 {
static_assert_imm3!(IMM5);
simd_extract(v, IMM5 as u32)
@ -3040,8 +3271,7 @@ pub unsafe fn vgetq_lane_u16<const IMM5: i32>(v: uint16x8_t) -> u16 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.32", IMM5 = 2))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mov, IMM5 = 2))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
pub unsafe fn vgetq_lane_u32<const IMM5: i32>(v: uint32x4_t) -> u32 {
static_assert_imm2!(IMM5);
simd_extract(v, IMM5 as u32)
@ -3052,8 +3282,7 @@ pub unsafe fn vgetq_lane_u32<const IMM5: i32>(v: uint32x4_t) -> u32 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.s16", IMM5 = 2))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smov, IMM5 = 2))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
pub unsafe fn vgetq_lane_s16<const IMM5: i32>(v: int16x8_t) -> i16 {
static_assert_imm3!(IMM5);
simd_extract(v, IMM5 as u32)
@ -3064,8 +3293,7 @@ pub unsafe fn vgetq_lane_s16<const IMM5: i32>(v: int16x8_t) -> i16 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.u16", IMM5 = 2))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umov, IMM5 = 2))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
pub unsafe fn vgetq_lane_p16<const IMM5: i32>(v: poly16x8_t) -> p16 {
static_assert_imm3!(IMM5);
simd_extract(v, IMM5 as u32)
@ -3076,8 +3304,7 @@ pub unsafe fn vgetq_lane_p16<const IMM5: i32>(v: poly16x8_t) -> p16 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.32", IMM5 = 2))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mov, IMM5 = 2))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
pub unsafe fn vgetq_lane_s32<const IMM5: i32>(v: int32x4_t) -> i32 {
static_assert_imm2!(IMM5);
simd_extract(v, IMM5 as u32)
@ -3088,8 +3315,7 @@ pub unsafe fn vgetq_lane_s32<const IMM5: i32>(v: int32x4_t) -> i32 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.u8", IMM5 = 2))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umov, IMM5 = 2))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
pub unsafe fn vget_lane_u8<const IMM5: i32>(v: uint8x8_t) -> u8 {
static_assert_imm3!(IMM5);
simd_extract(v, IMM5 as u32)
@ -3100,8 +3326,7 @@ pub unsafe fn vget_lane_u8<const IMM5: i32>(v: uint8x8_t) -> u8 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.s8", IMM5 = 2))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smov, IMM5 = 2))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
pub unsafe fn vget_lane_s8<const IMM5: i32>(v: int8x8_t) -> i8 {
static_assert_imm3!(IMM5);
simd_extract(v, IMM5 as u32)
@ -3112,8 +3337,7 @@ pub unsafe fn vget_lane_s8<const IMM5: i32>(v: int8x8_t) -> i8 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.u8", IMM5 = 2))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umov, IMM5 = 2))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
pub unsafe fn vget_lane_p8<const IMM5: i32>(v: poly8x8_t) -> p8 {
static_assert_imm3!(IMM5);
simd_extract(v, IMM5 as u32)
@ -3124,8 +3348,7 @@ pub unsafe fn vget_lane_p8<const IMM5: i32>(v: poly8x8_t) -> p8 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.u8", IMM5 = 2))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umov, IMM5 = 2))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
pub unsafe fn vgetq_lane_u8<const IMM5: i32>(v: uint8x16_t) -> u8 {
static_assert_imm4!(IMM5);
simd_extract(v, IMM5 as u32)
@ -3136,8 +3359,7 @@ pub unsafe fn vgetq_lane_u8<const IMM5: i32>(v: uint8x16_t) -> u8 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.s8", IMM5 = 2))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smov, IMM5 = 2))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
pub unsafe fn vgetq_lane_s8<const IMM5: i32>(v: int8x16_t) -> i8 {
static_assert_imm4!(IMM5);
simd_extract(v, IMM5 as u32)
@ -3148,8 +3370,7 @@ pub unsafe fn vgetq_lane_s8<const IMM5: i32>(v: int8x16_t) -> i8 {
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[rustc_legacy_const_generics(1)]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov.u8", IMM5 = 2))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umov, IMM5 = 2))]
#[cfg_attr(test, assert_instr(nop, IMM5 = 2))]
pub unsafe fn vgetq_lane_p8<const IMM5: i32>(v: poly8x16_t) -> p8 {
static_assert_imm4!(IMM5);
simd_extract(v, IMM5 as u32)
@ -3269,8 +3490,7 @@ pub unsafe fn vget_high_f32(a: float32x4_t) -> float32x2_t {
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
#[cfg_attr(test, assert_instr(nop))]
pub unsafe fn vget_low_s8(a: int8x16_t) -> int8x8_t {
simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
}
@ -3279,8 +3499,7 @@ pub unsafe fn vget_low_s8(a: int8x16_t) -> int8x8_t {
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
#[cfg_attr(test, assert_instr(nop))]
pub unsafe fn vget_low_s16(a: int16x8_t) -> int16x4_t {
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
@ -3289,8 +3508,7 @@ pub unsafe fn vget_low_s16(a: int16x8_t) -> int16x4_t {
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
#[cfg_attr(test, assert_instr(nop))]
pub unsafe fn vget_low_s32(a: int32x4_t) -> int32x2_t {
simd_shuffle2!(a, a, [0, 1])
}
@ -3299,8 +3517,7 @@ pub unsafe fn vget_low_s32(a: int32x4_t) -> int32x2_t {
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
#[cfg_attr(test, assert_instr(nop))]
pub unsafe fn vget_low_s64(a: int64x2_t) -> int64x1_t {
int64x1_t(simd_extract(a, 0))
}
@ -3309,8 +3526,7 @@ pub unsafe fn vget_low_s64(a: int64x2_t) -> int64x1_t {
/// Duplicate vector element to vector or scalar.
///
/// Returns the low half (lanes 0..7) of the 128-bit vector `a`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
// Compiles to no instruction (register aliasing), so the test expects `nop`;
// the duplicate stale `assert_instr(ldr)` attributes were removed.
#[cfg_attr(test, assert_instr(nop))]
pub unsafe fn vget_low_u8(a: uint8x16_t) -> uint8x8_t {
    simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
}
@ -3319,8 +3535,7 @@ pub unsafe fn vget_low_u8(a: uint8x16_t) -> uint8x8_t {
/// Duplicate vector element to vector or scalar.
///
/// Returns the low half (lanes 0..3) of the 128-bit vector `a`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
// Compiles to no instruction (register aliasing), so the test expects `nop`;
// the duplicate stale `assert_instr(ldr)` attributes were removed.
#[cfg_attr(test, assert_instr(nop))]
pub unsafe fn vget_low_u16(a: uint16x8_t) -> uint16x4_t {
    simd_shuffle4!(a, a, [0, 1, 2, 3])
}
@ -3329,8 +3544,7 @@ pub unsafe fn vget_low_u16(a: uint16x8_t) -> uint16x4_t {
/// Duplicate vector element to vector or scalar.
///
/// Returns the low half (lanes 0..1) of the 128-bit vector `a`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
// Compiles to no instruction (register aliasing), so the test expects `nop`;
// the duplicate stale `assert_instr(ldr)` attributes were removed.
#[cfg_attr(test, assert_instr(nop))]
pub unsafe fn vget_low_u32(a: uint32x4_t) -> uint32x2_t {
    simd_shuffle2!(a, a, [0, 1])
}
@ -3339,8 +3553,7 @@ pub unsafe fn vget_low_u32(a: uint32x4_t) -> uint32x2_t {
/// Duplicate vector element to vector or scalar.
///
/// Returns lane 0 of the 128-bit vector `a` as a one-element vector.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
// Compiles to no instruction (register aliasing), so the test expects `nop`;
// the duplicate stale `assert_instr(ldr)` attributes were removed.
#[cfg_attr(test, assert_instr(nop))]
pub unsafe fn vget_low_u64(a: uint64x2_t) -> uint64x1_t {
    uint64x1_t(simd_extract(a, 0))
}
@ -3349,8 +3562,7 @@ pub unsafe fn vget_low_u64(a: uint64x2_t) -> uint64x1_t {
/// Duplicate vector element to vector or scalar.
///
/// Returns the low half (lanes 0..7) of the 128-bit polynomial vector `a`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
// Compiles to no instruction (register aliasing), so the test expects `nop`;
// the duplicate stale `assert_instr(ldr)` attributes were removed.
#[cfg_attr(test, assert_instr(nop))]
pub unsafe fn vget_low_p8(a: poly8x16_t) -> poly8x8_t {
    simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
}
@ -3359,8 +3571,7 @@ pub unsafe fn vget_low_p8(a: poly8x16_t) -> poly8x8_t {
/// Duplicate vector element to vector or scalar.
///
/// Returns the low half (lanes 0..3) of the 128-bit polynomial vector `a`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
// Compiles to no instruction (register aliasing), so the test expects `nop`;
// the duplicate stale `assert_instr(ldr)` attributes were removed.
#[cfg_attr(test, assert_instr(nop))]
pub unsafe fn vget_low_p16(a: poly16x8_t) -> poly16x4_t {
    simd_shuffle4!(a, a, [0, 1, 2, 3])
}
@ -3369,8 +3580,7 @@ pub unsafe fn vget_low_p16(a: poly16x8_t) -> poly16x4_t {
/// Duplicate vector element to vector or scalar.
///
/// Returns the low half (lanes 0..1) of the 128-bit float vector `a`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
// Compiles to no instruction (register aliasing), so the test expects `nop`;
// the duplicate stale `assert_instr(ldr)` attributes were removed.
#[cfg_attr(test, assert_instr(nop))]
pub unsafe fn vget_low_f32(a: float32x4_t) -> float32x2_t {
    simd_shuffle2!(a, a, [0, 1])
}

View file

@ -2033,6 +2033,81 @@ aarch64 = sqadd
link-aarch64 = sqadd._EXT_
generate i32, i64
/// Load multiple single-element structures to one, two, three, or four registers
name = vld1
out-suffix
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
test = load_test
aarch64 = ld1
link-aarch64 = ld1x2._EXT2_
arm = vld1
link-arm = vld1x2._EXT2_
generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t
generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t
link-aarch64 = ld1x3._EXT2_
link-arm = vld1x3._EXT2_
generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t, *const i64:int64x1x3_t
generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t, *const i64:int64x2x3_t
link-aarch64 = ld1x4._EXT2_
link-arm = vld1x4._EXT2_
generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t, *const i64:int64x1x4_t
generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t, *const i64:int64x2x4_t
/// Load multiple single-element structures to one, two, three, or four registers
name = vld1
out-suffix
multi_fn = transmute, {vld1-outsigned-noext, transmute(a)}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
test = load_test
aarch64 = ld1
arm = vld1
generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t
generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t
generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t, *const u64:uint64x1x3_t
generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t, *const u64:uint64x2x3_t
generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t, *const u64:uint64x1x4_t
generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t, *const u64:uint64x2x4_t
generate *const p8:poly8x8x2_t, *const p8:poly8x8x3_t, *const p8:poly8x8x4_t
generate *const p8:poly8x16x2_t, *const p8:poly8x16x3_t, *const p8:poly8x16x4_t
generate *const p16:poly16x4x2_t, *const p16:poly16x4x3_t, *const p16:poly16x4x4_t
generate *const p16:poly16x8x2_t, *const p16:poly16x8x3_t, *const p16:poly16x8x4_t
/// Load multiple single-element structures to one, two, three, or four registers
name = vld1
out-suffix
a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
test = load_test
aarch64 = ld1
link-aarch64 = ld1x2._EXT2_
generate *const f64:float64x1x2_t, *const f64:float64x2x2_t
link-aarch64 = ld1x3._EXT2_
generate *const f64:float64x1x3_t, *const f64:float64x2x3_t
link-aarch64 = ld1x4._EXT2_
generate *const f64:float64x1x4_t, *const f64:float64x2x4_t
arm = vld1
link-aarch64 = ld1x2._EXT2_
link-arm = vld1x2._EXT2_
generate *const f32:float32x2x2_t, *const f32:float32x4x2_t
link-aarch64 = ld1x3._EXT2_
link-arm = vld1x3._EXT2_
generate *const f32:float32x2x3_t, *const f32:float32x4x3_t
link-aarch64 = ld1x4._EXT2_
link-arm = vld1x4._EXT2_
generate *const f32:float32x2x4_t, *const f32:float32x4x4_t
/// Multiply
name = vmul
a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2

View file

@ -51,38 +51,34 @@ const FLOAT_TYPES_64: [&str; 2] = [
];
fn type_len(t: &str) -> usize {
match t {
"int8x8_t" => 8,
"int8x16_t" => 16,
"int16x4_t" => 4,
"int16x8_t" => 8,
"int32x2_t" => 2,
"int32x4_t" => 4,
"int64x1_t" => 1,
"int64x2_t" => 2,
"uint8x8_t" => 8,
"uint8x16_t" => 16,
"uint16x4_t" => 4,
"uint16x8_t" => 8,
"uint32x2_t" => 2,
"uint32x4_t" => 4,
"uint64x1_t" => 1,
"uint64x2_t" => 2,
"float16x4_t" => 4,
"float16x8_t" => 8,
"float32x2_t" => 2,
"float32x4_t" => 4,
"float64x1_t" => 1,
"float64x2_t" => 2,
"poly8x8_t" => 8,
"poly8x16_t" => 16,
"poly16x4_t" => 4,
"poly16x8_t" => 8,
"poly64x1_t" => 1,
"poly64x2_t" => 2,
"i8" | "i16" | "i32" | "i64" | "u8" | "u16" | "u32" | "u64" | "f32" | "f64" | "p8"
| "p16" | "p64" | "p128" => 1,
_ => panic!("unknown type: {}", t),
let s: Vec<_> = t.split("x").collect();
if s.len() == 2 {
match &s[1][0..2] {
"1_" => 1,
"2_" => 2,
"4_" => 4,
"8_" => 8,
"16" => 16,
_ => panic!("unknown type: {}", t),
}
} else if s.len() == 3 {
s[1].parse::<usize>().unwrap() * type_sub_len(t)
} else {
1
}
}
/// Number of vectors in a NEON tuple type name.
///
/// Only three-part names (`<elem>x<lanes>x<count>_t`, e.g. "int8x8x2_t")
/// carry a vector count; every other name counts as a single vector.
/// Panics if a three-part name has a count other than 2, 3 or 4.
fn type_sub_len(t: &str) -> usize {
    let segments: Vec<&str> = t.split('x').collect();
    // Slice patterns: dispatch on the shape of the split name directly.
    match segments.as_slice() {
        [_, _, "2_t"] => 2,
        [_, _, "3_t"] => 3,
        [_, _, "4_t"] => 4,
        [_, _, _] => panic!("unknown type len: {}", t),
        _ => 1,
    }
}
@ -177,6 +173,84 @@ fn type_to_suffix(t: &str) -> &str {
"poly16x8_t" => "q_p16",
"poly64x1_t" => "_p64",
"poly64x2_t" => "q_p64",
"int8x8x2_t" => "_s8_x2",
"int8x8x3_t" => "_s8_x3",
"int8x8x4_t" => "_s8_x4",
"int16x4x2_t" => "_s16_x2",
"int16x4x3_t" => "_s16_x3",
"int16x4x4_t" => "_s16_x4",
"int32x2x2_t" => "_s32_x2",
"int32x2x3_t" => "_s32_x3",
"int32x2x4_t" => "_s32_x4",
"int64x1x2_t" => "_s64_x2",
"int64x1x3_t" => "_s64_x3",
"int64x1x4_t" => "_s64_x4",
"uint8x8x2_t" => "_u8_x2",
"uint8x8x3_t" => "_u8_x3",
"uint8x8x4_t" => "_u8_x4",
"uint16x4x2_t" => "_u16_x2",
"uint16x4x3_t" => "_u16_x3",
"uint16x4x4_t" => "_u16_x4",
"uint32x2x2_t" => "_u32_x2",
"uint32x2x3_t" => "_u32_x3",
"uint32x2x4_t" => "_u32_x4",
"uint64x1x2_t" => "_u64_x2",
"uint64x1x3_t" => "_u64_x3",
"uint64x1x4_t" => "_u64_x4",
"poly8x8x2_t" => "_p8_x2",
"poly8x8x3_t" => "_p8_x3",
"poly8x8x4_t" => "_p8_x4",
"poly16x4x2_t" => "_p16_x2",
"poly16x4x3_t" => "_p16_x3",
"poly16x4x4_t" => "_p16_x4",
"poly64x1x2_t" => "_p64_x2",
"poly64x1x3_t" => "_p64_x3",
"poly64x1x4_t" => "_p64_x4",
"float32x2x2_t" => "_f32_x2",
"float32x2x3_t" => "_f32_x3",
"float32x2x4_t" => "_f32_x4",
"float64x1x2_t" => "_f64_x2",
"float64x1x3_t" => "_f64_x3",
"float64x1x4_t" => "_f64_x4",
"int8x16x2_t" => "q_s8_x2",
"int8x16x3_t" => "q_s8_x3",
"int8x16x4_t" => "q_s8_x4",
"int16x8x2_t" => "q_s16_x2",
"int16x8x3_t" => "q_s16_x3",
"int16x8x4_t" => "q_s16_x4",
"int32x4x2_t" => "q_s32_x2",
"int32x4x3_t" => "q_s32_x3",
"int32x4x4_t" => "q_s32_x4",
"int64x2x2_t" => "q_s64_x2",
"int64x2x3_t" => "q_s64_x3",
"int64x2x4_t" => "q_s64_x4",
"uint8x16x2_t" => "q_u8_x2",
"uint8x16x3_t" => "q_u8_x3",
"uint8x16x4_t" => "q_u8_x4",
"uint16x8x2_t" => "q_u16_x2",
"uint16x8x3_t" => "q_u16_x3",
"uint16x8x4_t" => "q_u16_x4",
"uint32x4x2_t" => "q_u32_x2",
"uint32x4x3_t" => "q_u32_x3",
"uint32x4x4_t" => "q_u32_x4",
"uint64x2x2_t" => "q_u64_x2",
"uint64x2x3_t" => "q_u64_x3",
"uint64x2x4_t" => "q_u64_x4",
"poly8x16x2_t" => "q_p8_x2",
"poly8x16x3_t" => "q_p8_x3",
"poly8x16x4_t" => "q_p8_x4",
"poly16x8x2_t" => "q_p16_x2",
"poly16x8x3_t" => "q_p16_x3",
"poly16x8x4_t" => "q_p16_x4",
"poly64x2x2_t" => "q_p64_x2",
"poly64x2x3_t" => "q_p64_x3",
"poly64x2x4_t" => "q_p64_x4",
"float32x4x2_t" => "q_f32_x2",
"float32x4x3_t" => "q_f32_x3",
"float32x4x4_t" => "q_f32_x4",
"float64x2x2_t" => "q_f64_x2",
"float64x2x3_t" => "q_f64_x3",
"float64x2x4_t" => "q_f64_x4",
"i8" => "b_s8",
"i16" => "h_s16",
"i32" => "s_s32",
@ -274,18 +348,10 @@ fn type_to_lane_suffixes<'a>(out_t: &'a str, in_t: &'a str) -> String {
str
}
/// Map a NEON type name to its signed-integer counterpart by rewriting the
/// "uint"/"poly" prefix to "int"; names that are already signed (or float)
/// pass through unchanged.
///
/// Unlike the old hand-written table this works for every lane count,
/// including the x2/x3/x4 tuple types. Taking `&str` (rather than `&String`)
/// is both idiomatic and backward compatible: call sites passing
/// `&String::from(..)` deref-coerce automatically.
/// (Merge residue that left both the old and new definitions in place has
/// been removed.)
fn type_to_signed(t: &str) -> String {
    t.replace("uint", "int").replace("poly", "int")
}
fn type_to_unsigned(t: &str) -> &str {
@ -384,34 +450,34 @@ enum TargetFeature {
fn type_to_global_type(t: &str) -> &str {
match t {
"int8x8_t" => "i8x8",
"int8x16_t" => "i8x16",
"int16x4_t" => "i16x4",
"int16x8_t" => "i16x8",
"int32x2_t" => "i32x2",
"int32x4_t" => "i32x4",
"int64x1_t" => "i64x1",
"int64x2_t" => "i64x2",
"uint8x8_t" => "u8x8",
"uint8x16_t" => "u8x16",
"uint16x4_t" => "u16x4",
"uint16x8_t" => "u16x8",
"uint32x2_t" => "u32x2",
"uint32x4_t" => "u32x4",
"uint64x1_t" => "u64x1",
"uint64x2_t" => "u64x2",
"int8x8_t" | "int8x8x2_t" | "int8x8x3_t" | "int8x8x4_t" => "i8x8",
"int8x16_t" | "int8x16x2_t" | "int8x16x3_t" | "int8x16x4_t" => "i8x16",
"int16x4_t" | "int16x4x2_t" | "int16x4x3_t" | "int16x4x4_t" => "i16x4",
"int16x8_t" | "int16x8x2_t" | "int16x8x3_t" | "int16x8x4_t" => "i16x8",
"int32x2_t" | "int32x2x2_t" | "int32x2x3_t" | "int32x2x4_t" => "i32x2",
"int32x4_t" | "int32x4x2_t" | "int32x4x3_t" | "int32x4x4_t" => "i32x4",
"int64x1_t" | "int64x1x2_t" | "int64x1x3_t" | "int64x1x4_t" => "i64x1",
"int64x2_t" | "int64x2x2_t" | "int64x2x3_t" | "int64x2x4_t" => "i64x2",
"uint8x8_t" | "uint8x8x2_t" | "uint8x8x3_t" | "uint8x8x4_t" => "u8x8",
"uint8x16_t" | "uint8x16x2_t" | "uint8x16x3_t" | "uint8x16x4_t" => "u8x16",
"uint16x4_t" | "uint16x4x2_t" | "uint16x4x3_t" | "uint16x4x4_t" => "u16x4",
"uint16x8_t" | "uint16x8x2_t" | "uint16x8x3_t" | "uint16x8x4_t" => "u16x8",
"uint32x2_t" | "uint32x2x2_t" | "uint32x2x3_t" | "uint32x2x4_t" => "u32x2",
"uint32x4_t" | "uint32x4x2_t" | "uint32x4x3_t" | "uint32x4x4_t" => "u32x4",
"uint64x1_t" | "uint64x1x2_t" | "uint64x1x3_t" | "uint64x1x4_t" => "u64x1",
"uint64x2_t" | "uint64x2x2_t" | "uint64x2x3_t" | "uint64x2x4_t" => "u64x2",
"float16x4_t" => "f16x4",
"float16x8_t" => "f16x8",
"float32x2_t" => "f32x2",
"float32x4_t" => "f32x4",
"float64x1_t" => "f64",
"float64x2_t" => "f64x2",
"poly8x8_t" => "i8x8",
"poly8x16_t" => "i8x16",
"poly16x4_t" => "i16x4",
"poly16x8_t" => "i16x8",
"poly64x1_t" => "i64x1",
"poly64x2_t" => "i64x2",
"float32x2_t" | "float32x2x2_t" | "float32x2x3_t" | "float32x2x4_t" => "f32x2",
"float32x4_t" | "float32x4x2_t" | "float32x4x3_t" | "float32x4x4_t" => "f32x4",
"float64x1_t" | "float64x1x2_t" | "float64x1x3_t" | "float64x1x4_t" => "f64",
"float64x2_t" | "float64x2x2_t" | "float64x2x3_t" | "float64x2x4_t" => "f64x2",
"poly8x8_t" | "poly8x8x2_t" | "poly8x8x3_t" | "poly8x8x4_t" => "i8x8",
"poly8x16_t" | "poly8x16x2_t" | "poly8x16x3_t" | "poly8x16x4_t" => "i8x16",
"poly16x4_t" | "poly16x4x2_t" | "poly16x4x3_t" | "poly16x4x4_t" => "i16x4",
"poly16x8_t" | "poly16x8x2_t" | "poly16x8x3_t" | "poly16x8x4_t" => "i16x8",
"poly64x1_t" | "poly64x1x2_t" | "poly64x1x3_t" | "poly64x1x4_t" => "i64x1",
"poly64x2_t" | "poly64x2x2_t" | "poly64x2x3_t" | "poly64x2x4_t" => "i64x2",
"i8" => "i8",
"i16" => "i16",
"i32" => "i32",
@ -432,18 +498,33 @@ fn type_to_global_type(t: &str) -> &str {
/// Scalar element type backing a NEON type name (vector, tuple or scalar),
/// e.g. "int8x16x2_t" -> "i8", "poly16x4_t" -> "u16".
///
/// Polynomial types are backed by unsigned integers of the same width.
/// Panics on an unrecognized name so spec typos fail the generator loudly.
/// (The superseded pre-rewrite arms left in by the flattened merge — which
/// produced duplicate/unreachable patterns — have been removed.)
fn type_to_native_type(t: &str) -> &str {
    match t {
        "int8x8_t" | "int8x16_t" | "i8" | "int8x8x2_t" | "int8x8x3_t" | "int8x8x4_t"
        | "int8x16x2_t" | "int8x16x3_t" | "int8x16x4_t" => "i8",
        "int16x4_t" | "int16x8_t" | "i16" | "int16x4x2_t" | "int16x4x3_t" | "int16x4x4_t"
        | "int16x8x2_t" | "int16x8x3_t" | "int16x8x4_t" => "i16",
        "int32x2_t" | "int32x4_t" | "i32" | "int32x2x2_t" | "int32x2x3_t" | "int32x2x4_t"
        | "int32x4x2_t" | "int32x4x3_t" | "int32x4x4_t" => "i32",
        "int64x1_t" | "int64x2_t" | "i64" | "int64x1x2_t" | "int64x1x3_t" | "int64x1x4_t"
        | "int64x2x2_t" | "int64x2x3_t" | "int64x2x4_t" => "i64",
        "uint8x8_t" | "uint8x16_t" | "u8" | "uint8x8x2_t" | "uint8x8x3_t" | "uint8x8x4_t"
        | "uint8x16x2_t" | "uint8x16x3_t" | "uint8x16x4_t" => "u8",
        "uint16x4_t" | "uint16x8_t" | "u16" | "uint16x4x2_t" | "uint16x4x3_t" | "uint16x4x4_t"
        | "uint16x8x2_t" | "uint16x8x3_t" | "uint16x8x4_t" => "u16",
        "uint32x2_t" | "uint32x4_t" | "u32" | "uint32x2x2_t" | "uint32x2x3_t" | "uint32x2x4_t"
        | "uint32x4x2_t" | "uint32x4x3_t" | "uint32x4x4_t" => "u32",
        "uint64x1_t" | "uint64x2_t" | "u64" | "uint64x1x2_t" | "uint64x1x3_t" | "uint64x1x4_t"
        | "uint64x2x2_t" | "uint64x2x3_t" | "uint64x2x4_t" => "u64",
        "float16x4_t" | "float16x8_t" => "f16",
        "float32x2_t" | "float32x4_t" | "float32x2x2_t" | "float32x2x3_t" | "float32x2x4_t"
        | "float32x4x2_t" | "float32x4x3_t" | "float32x4x4_t" => "f32",
        "float64x1_t" | "float64x2_t" | "float64x1x2_t" | "float64x1x3_t" | "float64x1x4_t"
        | "float64x2x2_t" | "float64x2x3_t" | "float64x2x4_t" => "f64",
        // Polynomial vectors are represented natively as unsigned integers.
        "poly8x8_t" | "poly8x16_t" | "poly8x8x2_t" | "poly8x8x3_t" | "poly8x8x4_t"
        | "poly8x16x2_t" | "poly8x16x3_t" | "poly8x16x4_t" => "u8",
        "poly16x4_t" | "poly16x8_t" | "poly16x4x2_t" | "poly16x4x3_t" | "poly16x4x4_t"
        | "poly16x8x2_t" | "poly16x8x3_t" | "poly16x8x4_t" => "u16",
        "poly64x1_t" | "poly64x2_t" | "poly64x1x2_t" | "poly64x1x3_t" | "poly64x1x4_t"
        | "poly64x2x2_t" | "poly64x2x3_t" | "poly64x2x4_t" => "u64",
        _ => panic!("unknown type: {}", t),
    }
}
@ -510,6 +591,26 @@ fn type_to_ext(t: &str) -> &str {
"poly8x16_t" => "v16i8",
"poly16x4_t" => "v4i16",
"poly16x8_t" => "v8i16",
"int8x8x2_t" | "int8x8x3_t" | "int8x8x4_t" => "v8i8.p0i8",
"int16x4x2_t" | "int16x4x3_t" | "int16x4x4_t" => "v4i16.p0i16",
"int32x2x2_t" | "int32x2x3_t" | "int32x2x4_t" => "v2i32.p0i32",
"int64x1x2_t" | "int64x1x3_t" | "int64x1x4_t" => "v1i64.p0i64",
"uint8x8x2_t" | "uint8x8x3_t" | "uint8x8x4_t" => "v8i8.p0i8",
"uint16x4x2_t" | "uint16x4x3_t" | "uint16x4x4_t" => "v4i16.p0i16",
"uint32x2x2_t" | "uint32x2x3_t" | "uint32x2x4_t" => "v2i32.p0i32",
"uint64x1x2_t" | "uint64x1x3_t" | "uint64x1x4_t" => "v1i64.p0i64",
"float32x2x2_t" | "float32x2x3_t" | "float32x2x4_t" => "v2f32.p0f32",
"float64x1x2_t" | "float64x1x3_t" | "float64x1x4_t" => "v1f64.p0f64",
"int8x16x2_t" | "int8x16x3_t" | "int8x16x4_t" => "v16i8.p0i8",
"int16x8x2_t" | "int16x8x3_t" | "int16x8x4_t" => "v8i16.p0i16",
"int32x4x2_t" | "int32x4x3_t" | "int32x4x4_t" => "v4i32.p0i32",
"int64x2x2_t" | "int64x2x3_t" | "int64x2x4_t" => "v2i64.p0i64",
"uint8x16x2_t" | "uint8x16x3_t" | "uint8x16x4_t" => "v16i8.p0i8",
"uint16x8x2_t" | "uint16x8x3_t" | "uint16x8x4_t" => "v8i16.p0i16",
"uint32x4x2_t" | "uint32x4x3_t" | "uint32x4x4_t" => "v4i32.p0i32",
"uint64x2x2_t" | "uint64x2x3_t" | "uint64x2x4_t" => "v2i64.p0i64",
"float32x4x2_t" | "float32x4x3_t" | "float32x4x4_t" => "v4f32.p0f32",
"float64x2x2_t" | "float64x2x3_t" | "float64x2x4_t" => "v2f64.p0f64",
"i8" => "i8",
"i16" => "i16",
"i32" => "i32",
@ -522,6 +623,16 @@ fn type_to_ext(t: &str) -> &str {
"f64" => "f64",
"p64" => "p64",
"p128" => "p128",
"*const i8" => "i8",
"*const i16" => "i16",
"*const i32" => "i32",
"*const i64" => "i64",
"*const u8" => "i8",
"*const u16" => "i16",
"*const u32" => "i32",
"*const u64" => "i64",
"*const f32" => "f32",
"*const f64" => "f64",
/*
"poly64x1_t" => "i64x1",
"poly64x2_t" => "i64x2",
@ -858,9 +969,8 @@ fn gen_aarch64(
target: TargetFeature,
fixed: &Vec<String>,
multi_fn: &Vec<String>,
test_fn: &str,
) -> (String, String) {
let _global_t = type_to_global_type(in_t[0]);
let _global_ret_t = type_to_global_type(out_t);
let name = match suffix {
Normal => format!("{}{}", current_name, type_to_suffix(in_t[1])),
NoQ => format!("{}{}", current_name, type_to_noq_suffix(in_t[1])),
@ -941,7 +1051,7 @@ fn gen_aarch64(
};
ext_c = format!(
r#"#[allow(improper_ctypes)]
extern "C" {{
extern "unadjusted" {{
#[cfg_attr(target_arch = "aarch64", link_name = "{}")]
fn {}({}) -> {};
}}
@ -965,7 +1075,7 @@ fn gen_aarch64(
if const_aarch64.is_some() {
ext_c_const = format!(
r#"#[allow(improper_ctypes)]
extern "C" {{
extern "unadjusted" {{
#[cfg_attr(target_arch = "aarch64", link_name = "{}")]
fn {}({}) -> {};
}}
@ -1162,17 +1272,94 @@ fn gen_aarch64(
current_comment, current_target, current_aarch64, const_assert, const_legacy, call
);
let test = gen_test(
&name,
in_t,
&out_t,
current_tests,
[type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])],
type_len(out_t),
para_num,
);
let test = if test_fn == "load_test" {
gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t))
} else {
gen_test(
&name,
in_t,
&out_t,
current_tests,
[type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])],
type_len(out_t),
para_num,
)
};
(function, test)
}
/// Generate the body of a `#[simd_test]` unit test for a vld1*-style load
/// intrinsic.
///
/// `name` is the intrinsic under test, `out_t` its tuple-of-vectors return
/// type, and `len_out` the total lane count of that type (== type_len(out_t),
/// supplied by the caller). For each spec test case, only the input column
/// `a` and the expected column `e` are used; the emitted test builds a flat
/// native-typed input array, calls the intrinsic, transmutes the returned
/// tuple into an array of per-vector global types, and compares.
fn gen_load_test(
    name: &str,
    _in_t: &[&str; 3],
    out_t: &str,
    current_tests: &[(
        Vec<String>,
        Vec<String>,
        Vec<String>,
        Option<String>,
        Vec<String>,
    )],
    len_out: usize,
) -> String {
    // Test header; `{{` is format!'s escape for a literal `{`.
    let mut test = format!(
        r#"
#[simd_test(enable = "neon")]
unsafe fn test_{}() {{"#,
        name,
    );
    for (a, _, _, _, e) in current_tests {
        // One extra input element: the intrinsic is called on `a[1..]`
        // (presumably so the load starts at a non-zero offset — TODO confirm).
        let a: Vec<String> = a.iter().take(len_out + 1).cloned().collect();
        let e: Vec<String> = e.iter().take(len_out).cloned().collect();
        // Render the input array literal, e.g. "[0, 1, 2, ...]".
        let mut input = String::from("[");
        for i in 0..type_len(out_t) + 1 {
            if i != 0 {
                input.push_str(", ");
            }
            input.push_str(&a[i])
        }
        input.push_str("]");
        // Render the expected array: one entry per sub-vector of the tuple.
        let mut output = String::from("[");
        for i in 0..type_sub_len(out_t) {
            if i != 0 {
                output.push_str(", ");
            }
            // Lanes per sub-vector.
            let sub_len = type_len(out_t) / type_sub_len(out_t);
            if type_to_global_type(out_t) != "f64" {
                // Vector-valued sub-type: emit a `u8x8::new(..)`-style
                // constructor listing its lanes.
                let mut sub_output = format!("{}::new(", type_to_global_type(out_t));
                for j in 0..sub_len {
                    if j != 0 {
                        sub_output.push_str(", ");
                    }
                    sub_output.push_str(&e[i * sub_len + j]);
                }
                sub_output.push_str(")");
                output.push_str(&sub_output);
            } else {
                // float64x1* maps to the scalar global type "f64", which has
                // no `::new` constructor: emit the bare lane value.
                output.push_str(&e[i]);
            }
        }
        output.push_str("]");
        // Assemble one `a`/`e`/`r` check for this test case.
        let t = format!(
            r#"
let a: [{}; {}] = {};
let e: [{}; {}] = {};
let r: [{}; {}] = transmute({}(a[1..].as_ptr()));
assert_eq!(r, e);
"#,
            type_to_native_type(out_t),
            type_len(out_t) + 1,
            input,
            type_to_global_type(out_t),
            type_sub_len(out_t),
            output,
            type_to_global_type(out_t),
            type_sub_len(out_t),
            name,
        );
        test.push_str(&t);
    }
    // Close the generated test function.
    test.push_str(" }\n");
    test
}
fn gen_test(
name: &str,
@ -1305,9 +1492,8 @@ fn gen_arm(
target: TargetFeature,
fixed: &Vec<String>,
multi_fn: &Vec<String>,
test_fn: &str,
) -> (String, String) {
let _global_t = type_to_global_type(in_t[0]);
let _global_ret_t = type_to_global_type(out_t);
let name = match suffix {
Normal => format!("{}{}", current_name, type_to_suffix(in_t[1])),
NoQ => format!("{}{}", current_name, type_to_noq_suffix(in_t[1])),
@ -1440,7 +1626,7 @@ fn gen_arm(
if out_t == link_arm_t[3] && out_t == link_aarch64_t[3] {
ext_c = format!(
r#"#[allow(improper_ctypes)]
extern "C" {{
extern "unadjusted" {{
#[cfg_attr(target_arch = "arm", link_name = "{}")]
#[cfg_attr(target_arch = "aarch64", link_name = "{}")]
fn {}({}) -> {};
@ -1476,7 +1662,7 @@ fn gen_arm(
};
ext_c_arm.push_str(&format!(
r#"#[allow(improper_ctypes)]
extern "C" {{
extern "unadjusted" {{
#[cfg_attr(target_arch = "arm", link_name = "{}")]
fn {}({}) -> {};
}}
@ -1504,7 +1690,7 @@ fn gen_arm(
if out_t != link_arm_t[3] {
ext_c_arm.push_str(&format!(
r#"#[allow(improper_ctypes)]
extern "C" {{
extern "unadjusted" {{
#[cfg_attr(target_arch = "arm", link_name = "{}")]
fn {}({}) -> {};
}}
@ -1532,7 +1718,7 @@ fn gen_arm(
if const_aarch64.is_some() {
ext_c_aarch64.push_str(&format!(
r#"#[allow(improper_ctypes)]
extern "C" {{
extern "unadjusted" {{
#[cfg_attr(target_arch = "aarch64", link_name = "{}")]
fn {}({}) -> {};
}}
@ -1557,7 +1743,7 @@ fn gen_arm(
if out_t != link_aarch64_t[3] {
ext_c_aarch64.push_str(&format!(
r#"#[allow(improper_ctypes)]
extern "C" {{
extern "unadjusted" {{
#[cfg_attr(target_arch = "aarch64", link_name = "{}")]
fn {}({}) -> {};
}}
@ -1880,15 +2066,19 @@ fn gen_arm(
call,
)
};
let test = gen_test(
&name,
in_t,
&out_t,
current_tests,
[type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])],
type_len(out_t),
para_num,
);
let test = if test_fn == "load_test" {
gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t))
} else {
gen_test(
&name,
in_t,
&out_t,
current_tests,
[type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])],
type_len(out_t),
para_num,
)
};
(function, test)
}
@ -2305,7 +2495,9 @@ fn get_call(
} else if fn_format[1] == "in2lane" {
fn_name.push_str(&type_to_lane_suffixes(out_t, in_t[2]));
} else if fn_format[1] == "signed" {
fn_name.push_str(type_to_suffix(type_to_signed(in_t[1])));
fn_name.push_str(type_to_suffix(&type_to_signed(&String::from(in_t[1]))));
} else if fn_format[1] == "outsigned" {
fn_name.push_str(type_to_suffix(&type_to_signed(&String::from(out_t))));
} else if fn_format[1] == "unsigned" {
fn_name.push_str(type_to_suffix(type_to_unsigned(in_t[1])));
} else if fn_format[1] == "doubleself" {
@ -2315,7 +2507,7 @@ fn get_call(
} else if fn_format[1] == "noqself" {
fn_name.push_str(type_to_noq_suffix(in_t[1]));
} else if fn_format[1] == "noqsigned" {
fn_name.push_str(type_to_noq_suffix(type_to_signed(in_t[1])));
fn_name.push_str(type_to_noq_suffix(&type_to_signed(&String::from(in_t[1]))));
} else if fn_format[1] == "nosuffix" {
} else if fn_format[1] == "in_len" {
fn_name.push_str(&type_len(in_t[1]).to_string());
@ -2330,7 +2522,7 @@ fn get_call(
} else if fn_format[1] == "nin0" {
fn_name.push_str(type_to_n_suffix(in_t[0]));
} else if fn_format[1] == "nsigned" {
fn_name.push_str(type_to_n_suffix(type_to_signed(in_t[1])));
fn_name.push_str(type_to_n_suffix(&type_to_signed(&String::from(in_t[1]))));
} else if fn_format[1] == "in_ntt" {
fn_name.push_str(type_to_suffix(native_type_to_type(in_t[1])));
} else if fn_format[1] == "out_ntt" {
@ -2410,6 +2602,7 @@ fn main() -> io::Result<()> {
)> = Vec::new();
let mut multi_fn: Vec<String> = Vec::new();
let mut target: TargetFeature = Default;
let mut test_fn = "normal";
//
// THIS FILE IS GENERATED FORM neon.spec DO NOT CHANGE IT MANUALLY
@ -2491,6 +2684,7 @@ mod test {
n = None;
multi_fn = Vec::new();
target = Default;
test_fn = "normal";
} else if line.starts_with("//") {
} else if line.starts_with("name = ") {
current_name = Some(String::from(&line[7..]));
@ -2547,6 +2741,14 @@ mod test {
link_arm = Some(String::from(&line[11..]));
} else if line.starts_with("const-arm = ") {
const_arm = Some(String::from(&line[12..]));
} else if line.starts_with("test = ") {
test_fn = if line.contains("load_test") {
"load_test"
} else if line.contains("store_test") {
"store_test"
} else {
"normal"
}
} else if line.starts_with("target = ") {
target = match Some(String::from(&line[9..])) {
Some(input) => match input.as_str() {
@ -2618,6 +2820,7 @@ mod test {
target,
&fixed,
&multi_fn,
test_fn,
);
out_arm.push_str(&function);
tests_arm.push_str(&test);
@ -2638,6 +2841,7 @@ mod test {
target,
&fixed,
&multi_fn,
test_fn,
);
out_aarch64.push_str(&function);
tests_aarch64.push_str(&test);

View file

@ -218,11 +218,29 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream {
"int8x16_t" => quote! { &I8X16 },
"int16x2_t" => quote! { &I16X2 },
"int16x4_t" => quote! { &I16X4 },
"int16x4x2_t" => quote! { &I16X4X2 },
"int16x4x3_t" => quote! { &I16x4x3 },
"int16x4x4_t" => quote! { &I16x4x4 },
"int16x8_t" => quote! { &I16X8 },
"int16x8x2_t" => quote! { &I16X8X2 },
"int16x8x3_t" => quote! { &I16x8x3 },
"int16x8x4_t" => quote! { &I16x8x4 },
"int32x2_t" => quote! { &I32X2 },
"int32x2x2_t" => quote! { &I32X2X2 },
"int32x2x3_t" => quote! { &I32X2X3 },
"int32x2x4_t" => quote! { &I32X2X4 },
"int32x4_t" => quote! { &I32X4 },
"int32x4x2_t" => quote! { &I32X4X2 },
"int32x4x3_t" => quote! { &I32X4X3 },
"int32x4x4_t" => quote! { &I32X4X4 },
"int64x1_t" => quote! { &I64X1 },
"int64x1x2_t" => quote! { &I64X1X2 },
"int64x1x3_t" => quote! { &I64X1X3 },
"int64x1x4_t" => quote! { &I64X1X4 },
"int64x2_t" => quote! { &I64X2 },
"int64x2x2_t" => quote! { &I64X2X2 },
"int64x2x3_t" => quote! { &I64X2X3 },
"int64x2x4_t" => quote! { &I64X2X4 },
"uint8x8_t" => quote! { &U8X8 },
"uint8x4_t" => quote! { &U8X4 },
"uint8x8x2_t" => quote! { &U8X8X2 },
@ -233,15 +251,45 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream {
"uint8x8x4_t" => quote! { &U8X8X4 },
"uint8x16_t" => quote! { &U8X16 },
"uint16x4_t" => quote! { &U16X4 },
"uint16x4x2_t" => quote! { &U16X4X2 },
"uint16x4x3_t" => quote! { &U16x4x3 },
"uint16x4x4_t" => quote! { &U16x4x4 },
"uint16x8_t" => quote! { &U16X8 },
"uint16x8x2_t" => quote! { &U16X8X2 },
"uint16x8x3_t" => quote! { &U16x8x3 },
"uint16x8x4_t" => quote! { &U16x8x4 },
"uint32x2_t" => quote! { &U32X2 },
"uint32x2x2_t" => quote! { &U32X2X2 },
"uint32x2x3_t" => quote! { &U32X2X3 },
"uint32x2x4_t" => quote! { &U32X2X4 },
"uint32x4_t" => quote! { &U32X4 },
"uint32x4x2_t" => quote! { &U32X4X2 },
"uint32x4x3_t" => quote! { &U32X4X3 },
"uint32x4x4_t" => quote! { &U32X4X4 },
"uint64x1_t" => quote! { &U64X1 },
"uint64x1x2_t" => quote! { &U64X1X2 },
"uint64x1x3_t" => quote! { &U64X1X3 },
"uint64x1x4_t" => quote! { &U64X1X4 },
"uint64x2_t" => quote! { &U64X2 },
"uint64x2x2_t" => quote! { &U64X2X2 },
"uint64x2x3_t" => quote! { &U64X2X3 },
"uint64x2x4_t" => quote! { &U64X2X4 },
"float32x2_t" => quote! { &F32X2 },
"float32x2x2_t" => quote! { &F32X2X2 },
"float32x2x3_t" => quote! { &F32X2X3 },
"float32x2x4_t" => quote! { &F32X2X4 },
"float32x4_t" => quote! { &F32X4 },
"float32x4x2_t" => quote! { &F32X4X2 },
"float32x4x3_t" => quote! { &F32X4X3 },
"float32x4x4_t" => quote! { &F32X4X4 },
"float64x1_t" => quote! { &F64X1 },
"float64x1x2_t" => quote! { &F64X1X2 },
"float64x1x3_t" => quote! { &F64X1X3 },
"float64x1x4_t" => quote! { &F64X1X4 },
"float64x2_t" => quote! { &F64X2 },
"float64x2x2_t" => quote! { &F64X2X2 },
"float64x2x3_t" => quote! { &F64X2X3 },
"float64x2x4_t" => quote! { &F64X2X4 },
"poly8x8_t" => quote! { &POLY8X8 },
"poly8x8x2_t" => quote! { &POLY8X8X2 },
"poly8x8x3_t" => quote! { &POLY8X8X3 },
@ -254,7 +302,13 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream {
"poly64x2_t" => quote! { &POLY64X2 },
"poly8x16_t" => quote! { &POLY8X16 },
"poly16x4_t" => quote! { &POLY16X4 },
"poly16x4x2_t" => quote! { &POLY16X4X2 },
"poly16x4x3_t" => quote! { &POLY16X4X3 },
"poly16x4x4_t" => quote! { &POLY16X4X4 },
"poly16x8_t" => quote! { &POLY16X8 },
"poly16x8x2_t" => quote! { &POLY16X8X2 },
"poly16x8x3_t" => quote! { &POLY16X8X3 },
"poly16x8x4_t" => quote! { &POLY16X8X4 },
"p128" => quote! { &P128 },
"v16i8" => quote! { &v16i8 },