From c6ad1cf58b1ad4d454562e52c8fb8d77063f6550 Mon Sep 17 00:00:00 2001 From: Adam Hillier <7688302+AdamHillier@users.noreply.github.com> Date: Fri, 6 Nov 2020 21:56:51 +0000 Subject: [PATCH] Add more addition Arm intrinsics (#939) --- .../crates/core_arch/src/aarch64/neon/mod.rs | 312 +++ .../crates/core_arch/src/arm/neon/mod.rs | 1690 ++++++++++++++++- library/stdarch/crates/stdarch-gen/neon.spec | 35 +- 3 files changed, 2004 insertions(+), 33 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs index c37a5fcab4a8..0643e240fd2e 100644 --- a/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs +++ b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs @@ -79,6 +79,40 @@ extern "C" { #[link_name = "llvm.aarch64.neon.abs.v2i64"] fn vabsq_s64_(a: int64x2_t) -> int64x2_t; + #[link_name = "llvm.aarch64.neon.suqadd.v8i8"] + fn vuqadd_s8_(a: int8x8_t, b: uint8x8_t) -> int8x8_t; + #[link_name = "llvm.aarch64.neon.suqadd.v16i8"] + fn vuqaddq_s8_(a: int8x16_t, b: uint8x16_t) -> int8x16_t; + #[link_name = "llvm.aarch64.neon.suqadd.v4i16"] + fn vuqadd_s16_(a: int16x4_t, b: uint16x4_t) -> int16x4_t; + #[link_name = "llvm.aarch64.neon.suqadd.v8i16"] + fn vuqaddq_s16_(a: int16x8_t, b: uint16x8_t) -> int16x8_t; + #[link_name = "llvm.aarch64.neon.suqadd.v2i32"] + fn vuqadd_s32_(a: int32x2_t, b: uint32x2_t) -> int32x2_t; + #[link_name = "llvm.aarch64.neon.suqadd.v4i32"] + fn vuqaddq_s32_(a: int32x4_t, b: uint32x4_t) -> int32x4_t; + #[link_name = "llvm.aarch64.neon.suqadd.v1i64"] + fn vuqadd_s64_(a: int64x1_t, b: uint64x1_t) -> int64x1_t; + #[link_name = "llvm.aarch64.neon.suqadd.v2i64"] + fn vuqaddq_s64_(a: int64x2_t, b: uint64x2_t) -> int64x2_t; + + #[link_name = "llvm.aarch64.neon.usqadd.v8i8"] + fn vsqadd_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; + #[link_name = "llvm.aarch64.neon.usqadd.v16i8"] + fn vsqaddq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; + #[link_name = 
"llvm.aarch64.neon.usqadd.v4i16"] + fn vsqadd_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; + #[link_name = "llvm.aarch64.neon.usqadd.v8i16"] + fn vsqaddq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + #[link_name = "llvm.aarch64.neon.usqadd.v2i32"] + fn vsqadd_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + #[link_name = "llvm.aarch64.neon.usqadd.v4i32"] + fn vsqaddq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + #[link_name = "llvm.aarch64.neon.usqadd.v1i64"] + fn vsqadd_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + #[link_name = "llvm.aarch64.neon.usqadd.v2i64"] + fn vsqaddq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + #[link_name = "llvm.aarch64.neon.pmull64"] fn vmull_p64_(a: i64, b: i64) -> int8x16_t; @@ -348,6 +382,120 @@ pub unsafe fn vabsq_s64(a: int64x2_t) -> int64x2_t { vabsq_s64_(a) } +/// Signed saturating Accumulate of Unsigned value. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(suqadd))] +pub unsafe fn vuqadd_s8(a: int8x8_t, b: uint8x8_t) -> int8x8_t { + vuqadd_s8_(a, b) +} +/// Signed saturating Accumulate of Unsigned value. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(suqadd))] +pub unsafe fn vuqaddq_s8(a: int8x16_t, b: uint8x16_t) -> int8x16_t { + vuqaddq_s8_(a, b) +} +/// Signed saturating Accumulate of Unsigned value. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(suqadd))] +pub unsafe fn vuqadd_s16(a: int16x4_t, b: uint16x4_t) -> int16x4_t { + vuqadd_s16_(a, b) +} +/// Signed saturating Accumulate of Unsigned value. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(suqadd))] +pub unsafe fn vuqaddq_s16(a: int16x8_t, b: uint16x8_t) -> int16x8_t { + vuqaddq_s16_(a, b) +} +/// Signed saturating Accumulate of Unsigned value. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(suqadd))] +pub unsafe fn vuqadd_s32(a: int32x2_t, b: uint32x2_t) -> int32x2_t { + vuqadd_s32_(a, b) +} +/// Signed saturating Accumulate of Unsigned value. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(suqadd))] +pub unsafe fn vuqaddq_s32(a: int32x4_t, b: uint32x4_t) -> int32x4_t { + vuqaddq_s32_(a, b) +} +/// Signed saturating Accumulate of Unsigned value. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(suqadd))] +pub unsafe fn vuqadd_s64(a: int64x1_t, b: uint64x1_t) -> int64x1_t { + vuqadd_s64_(a, b) +} +/// Signed saturating Accumulate of Unsigned value. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(suqadd))] +pub unsafe fn vuqaddq_s64(a: int64x2_t, b: uint64x2_t) -> int64x2_t { + vuqaddq_s64_(a, b) +} + +/// Unsigned saturating Accumulate of Signed value. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(usqadd))] +pub unsafe fn vsqadd_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t { + vsqadd_u8_(a, b) +} +/// Unsigned saturating Accumulate of Signed value. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(usqadd))] +pub unsafe fn vsqaddq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { + vsqaddq_u8_(a, b) +} +/// Unsigned saturating Accumulate of Signed value. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(usqadd))] +pub unsafe fn vsqadd_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t { + vsqadd_u16_(a, b) +} +/// Unsigned saturating Accumulate of Signed value. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(usqadd))] +pub unsafe fn vsqaddq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { + vsqaddq_u16_(a, b) +} +/// Unsigned saturating Accumulate of Signed value. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(usqadd))] +pub unsafe fn vsqadd_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { + vsqadd_u32_(a, b) +} +/// Unsigned saturating Accumulate of Signed value. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(usqadd))] +pub unsafe fn vsqaddq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { + vsqaddq_u32_(a, b) +} +/// Unsigned saturating Accumulate of Signed value. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(usqadd))] +pub unsafe fn vsqadd_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { + vsqadd_u64_(a, b) +} +/// Unsigned saturating Accumulate of Signed value. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(usqadd))] +pub unsafe fn vsqaddq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { + vsqaddq_u64_(a, b) +} + /// Add pairwise #[inline] #[target_feature(enable = "neon")] @@ -2328,6 +2476,170 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vuqadd_s8() { + let a = i8x8::new(i8::MIN, -3, -2, -1, 0, 1, 2, i8::MAX); + let b = u8x8::new(u8::MAX, 1, 2, 3, 4, 5, 6, 7); + let e = i8x8::new(i8::MAX, -2, 0, 2, 4, 6, 8, i8::MAX); + let r: i8x8 = transmute(vuqadd_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vuqaddq_s8() { + let a = i8x16::new( + i8::MIN, + -7, + -6, + -5, + -4, + -3, + -2, + -1, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + i8::MAX, + ); + let b = u8x16::new(u8::MAX, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e = i8x16::new( + i8::MAX, + -6, + -4, + -2, + 0, + 2, + 4, + 6, + 8, + 10, + 12, + 14, + 16, + 18, + 20, + i8::MAX, + ); + let r: i8x16 = transmute(vuqaddq_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vuqadd_s16() { + let a = i16x4::new(i16::MIN, -1, 0, i16::MAX); + let b = u16x4::new(u16::MAX, 1, 2, 3); + let e = 
i16x4::new(i16::MAX, 0, 2, i16::MAX); + let r: i16x4 = transmute(vuqadd_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vuqaddq_s16() { + let a = i16x8::new(i16::MIN, -3, -2, -1, 0, 1, 2, i16::MAX); + let b = u16x8::new(u16::MAX, 1, 2, 3, 4, 5, 6, 7); + let e = i16x8::new(i16::MAX, -2, 0, 2, 4, 6, 8, i16::MAX); + let r: i16x8 = transmute(vuqaddq_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vuqadd_s32() { + let a = i32x2::new(i32::MIN, i32::MAX); + let b = u32x2::new(u32::MAX, 1); + let e = i32x2::new(i32::MAX, i32::MAX); + let r: i32x2 = transmute(vuqadd_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vuqaddq_s32() { + let a = i32x4::new(i32::MIN, -1, 0, i32::MAX); + let b = u32x4::new(u32::MAX, 1, 2, 3); + let e = i32x4::new(i32::MAX, 0, 2, i32::MAX); + let r: i32x4 = transmute(vuqaddq_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vuqadd_s64() { + let a = i64x1::new(i64::MIN); + let b = u64x1::new(u64::MAX); + let e = i64x1::new(i64::MAX); + let r: i64x1 = transmute(vuqadd_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vuqaddq_s64() { + let a = i64x2::new(i64::MIN, i64::MAX); + let b = u64x2::new(u64::MAX, 1); + let e = i64x2::new(i64::MAX, i64::MAX); + let r: i64x2 = transmute(vuqaddq_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vsqadd_u8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, u8::MAX); + let b = i8x8::new(i8::MIN, -3, -2, -1, 0, 1, 2, 3); + let e = u8x8::new(0, 0, 0, 2, 4, 6, 8, u8::MAX); + let r: u8x8 = transmute(vsqadd_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsqaddq_u8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, u8::MAX); + let b = i8x16::new(i8::MIN, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + let e = u8x16::new(0, 0, 0, 0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, u8::MAX); + let r: u8x16 = transmute(vsqaddq_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsqadd_u16() { + let a = u16x4::new(0, 1, 2, u16::MAX); + let b = i16x4::new(i16::MIN, -1, 0, 1); + let e = u16x4::new(0, 0, 2, u16::MAX); + let r: u16x4 = transmute(vsqadd_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsqaddq_u16() { + let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, u16::MAX); + let b = i16x8::new(i16::MIN, -3, -2, -1, 0, 1, 2, 3); + let e = u16x8::new(0, 0, 0, 2, 4, 6, 8, u16::MAX); + let r: u16x8 = transmute(vsqaddq_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsqadd_u32() { + let a = u32x2::new(0, u32::MAX); + let b = i32x2::new(i32::MIN, 1); + let e = u32x2::new(0, u32::MAX); + let r: u32x2 = transmute(vsqadd_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsqaddq_u32() { + let a = u32x4::new(0, 1, 2, u32::MAX); + let b = i32x4::new(i32::MIN, -1, 0, 1); + let e = u32x4::new(0, 0, 2, u32::MAX); + let r: u32x4 = transmute(vsqaddq_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsqadd_u64() { + let a = u64x1::new(0); + let b = i64x1::new(i64::MIN); + let e = u64x1::new(0); + let r: u64x1 = transmute(vsqadd_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_vsqaddq_u64() { + let a = u64x2::new(0, u64::MAX); + let b = i64x2::new(i64::MIN, 1); + let e = u64x2::new(0, u64::MAX); + let r: u64x2 = transmute(vsqaddq_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vpaddq_s16() { let a = 
i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); diff --git a/library/stdarch/crates/core_arch/src/arm/neon/mod.rs b/library/stdarch/crates/core_arch/src/arm/neon/mod.rs index 0484beedfcb3..bc67bc1a6d04 100644 --- a/library/stdarch/crates/core_arch/src/arm/neon/mod.rs +++ b/library/stdarch/crates/core_arch/src/arm/neon/mod.rs @@ -176,6 +176,16 @@ extern "C" { #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxp.v2f32")] fn vpmaxf_v2f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vraddhn.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.raddhn.v8i8")] + fn vraddhn_s16_(a: int16x8_t, b: int16x8_t) -> int8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vraddhn.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.raddhn.v4i16")] + fn vraddhn_s32_(a: int32x4_t, b: int32x4_t) -> int16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vraddhn.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.raddhn.v2i32")] + fn vraddhn_s64_(a: int64x2_t, b: int64x2_t) -> int32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpadd.v4i16")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.addp.v4i16")] fn vpadd_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; @@ -189,6 +199,80 @@ extern "C" { #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.addp.v16i8")] fn vpaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v4i16.v8i8")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.saddlp.v4i16.v8i8" + )] + fn vpaddl_s8_(a: int8x8_t) -> int16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v2i32.v4i16")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.saddlp.v2i32.v4i16" + )] + fn vpaddl_s16_(a: int16x4_t) -> int32x2_t; + 
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v1i64.v2i32")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.saddlp.v1i64.v2i32" + )] + fn vpaddl_s32_(a: int32x2_t) -> int64x1_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v8i16.v16i8")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.saddlp.v8i16.v16i8" + )] + fn vpaddlq_s8_(a: int8x16_t) -> int16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v4i32.v8i16")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.saddlp.v4i32.v8i16" + )] + fn vpaddlq_s16_(a: int16x8_t) -> int32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v2i64.v4i32")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.saddlp.v2i64.v4i32" + )] + fn vpaddlq_s32_(a: int32x4_t) -> int64x2_t; + + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v4i16.v8i8")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.uaddlp.v4i16.v8i8" + )] + fn vpaddl_u8_(a: uint8x8_t) -> uint16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v2i32.v4i16")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.uaddlp.v2i32.v4i16" + )] + fn vpaddl_u16_(a: uint16x4_t) -> uint32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v1i64.v2i32")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.uaddlp.v1i64.v2i32" + )] + fn vpaddl_u32_(a: uint32x2_t) -> uint64x1_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v8i16.v16i8")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.uaddlp.v8i16.v16i8" + )] + fn vpaddlq_u8_(a: uint8x16_t) -> uint16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v4i32.v8i16")] + #[cfg_attr( + target_arch = "aarch64", + link_name = 
"llvm.aarch64.neon.uaddlp.v4i32.v8i16" + )] + fn vpaddlq_u16_(a: uint16x8_t) -> uint32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v2i64.v4i32")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.uaddlp.v2i64.v4i32" + )] + fn vpaddlq_u32_(a: uint32x4_t) -> uint64x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4f32")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v4f32")] fn vmaxq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t; @@ -207,6 +291,32 @@ extern "C" { #[cfg(target_arch = "arm")] #[allow(improper_ctypes)] extern "C" { + #[link_name = "llvm.arm.neon.vpadals.v4i16.v8i8"] + fn vpadal_s8_(a: int16x4_t, b: int8x8_t) -> int16x4_t; + #[link_name = "llvm.arm.neon.vpadals.v2i32.v4i16"] + fn vpadal_s16_(a: int32x2_t, b: int16x4_t) -> int32x2_t; + #[link_name = "llvm.arm.neon.vpadals.v1i64.v2i32"] + fn vpadal_s32_(a: int64x1_t, b: int32x2_t) -> int64x1_t; + #[link_name = "llvm.arm.neon.vpadals.v8i16.v16i8"] + fn vpadalq_s8_(a: int16x8_t, b: int8x16_t) -> int16x8_t; + #[link_name = "llvm.arm.neon.vpadals.v4i32.v8i16"] + fn vpadalq_s16_(a: int32x4_t, b: int16x8_t) -> int32x4_t; + #[link_name = "llvm.arm.neon.vpadals.v2i64.v4i32"] + fn vpadalq_s32_(a: int64x2_t, b: int32x4_t) -> int64x2_t; + + #[link_name = "llvm.arm.neon.vpadalu.v4i16.v8i8"] + fn vpadal_u8_(a: uint16x4_t, b: uint8x8_t) -> uint16x4_t; + #[link_name = "llvm.arm.neon.vpadalu.v2i32.v4i16"] + fn vpadal_u16_(a: uint32x2_t, b: uint16x4_t) -> uint32x2_t; + #[link_name = "llvm.arm.neon.vpadalu.v1i64.v2i32"] + fn vpadal_u32_(a: uint64x1_t, b: uint32x2_t) -> uint64x1_t; + #[link_name = "llvm.arm.neon.vpadalu.v8i16.v16i8"] + fn vpadalq_u8_(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t; + #[link_name = "llvm.arm.neon.vpadalu.v4i32.v8i16"] + fn vpadalq_u16_(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t; + #[link_name = "llvm.arm.neon.vpadalu.v2i64.v4i32"] + fn vpadalq_u32_(a: uint64x2_t, b: uint32x4_t) -> 
uint64x2_t; + #[link_name = "llvm.arm.neon.vtbl1"] fn vtbl1(a: int8x8_t, b: int8x8_t) -> int8x8_t; #[link_name = "llvm.arm.neon.vtbl2"] @@ -534,7 +644,7 @@ pub unsafe fn vaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { simd_add(a, b) } -/// Vector long add. +/// Signed Add Long (vector). #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] @@ -546,7 +656,7 @@ pub unsafe fn vaddl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { simd_add(a, b) } -/// Vector long add. +/// Signed Add Long (vector). #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] @@ -558,7 +668,7 @@ pub unsafe fn vaddl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { simd_add(a, b) } -/// Vector long add. +/// Signed Add Long (vector). #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] @@ -570,7 +680,7 @@ pub unsafe fn vaddl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { simd_add(a, b) } -/// Vector long add. +/// Unsigned Add Long (vector). #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] @@ -582,7 +692,7 @@ pub unsafe fn vaddl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { simd_add(a, b) } -/// Vector long add. +/// Unsigned Add Long (vector). #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] @@ -594,7 +704,7 @@ pub unsafe fn vaddl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { simd_add(a, b) } -/// Vector long add. +/// Unsigned Add Long (vector). #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] @@ -606,6 +716,804 @@ pub unsafe fn vaddl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { simd_add(a, b) } +/// Signed Add Long (vector, high half). 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))] +pub unsafe fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { + let a: int8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let a: int16x8_t = simd_cast(a); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +/// Signed Add Long (vector, high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))] +pub unsafe fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]); + let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]); + let a: int32x4_t = simd_cast(a); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +/// Signed Add Long (vector, high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))] +pub unsafe fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { + let a: int32x2_t = simd_shuffle2(a, a, [2, 3]); + let b: int32x2_t = simd_shuffle2(b, b, [2, 3]); + let a: int64x2_t = simd_cast(a); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +/// Unsigned Add Long (vector, high half). 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))] +pub unsafe fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { + let a: uint8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let a: uint16x8_t = simd_cast(a); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +/// Unsigned Add Long (vector, high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))] +pub unsafe fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { + let a: uint16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]); + let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]); + let a: uint32x4_t = simd_cast(a); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +/// Unsigned Add Long (vector, high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))] +pub unsafe fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { + let a: uint32x2_t = simd_shuffle2(a, a, [2, 3]); + let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]); + let a: uint64x2_t = simd_cast(a); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +/// Signed Add Wide. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw))] +pub unsafe fn vaddw_s8(a: int16x8_t, b: int8x8_t) -> int16x8_t { + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +/// Signed Add Wide. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw))] +pub unsafe fn vaddw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t { + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +/// Signed Add Wide. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw))] +pub unsafe fn vaddw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t { + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +/// Unsigned Add Wide. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw))] +pub unsafe fn vaddw_u8(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t { + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +/// Unsigned Add Wide. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw))] +pub unsafe fn vaddw_u16(a: uint32x4_t, b: uint16x4_t) -> uint32x4_t { + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +/// Unsigned Add Wide. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw))] +pub unsafe fn vaddw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t { + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +/// Signed Add Wide (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))] +pub unsafe fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t { + let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +/// Signed Add Wide (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))] +pub unsafe fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { + let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +/// Signed Add Wide (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))] +pub unsafe fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { + let b: int32x2_t = simd_shuffle2(b, b, [2, 3]); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +/// Unsigned Add Wide (high half). 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))] +pub unsafe fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { + let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +/// Unsigned Add Wide (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))] +pub unsafe fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { + let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +/// Unsigned Add Wide (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))] +pub unsafe fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { + let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +/// Add returning High Narrow. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn))] +pub unsafe fn vaddhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t { + simd_cast(simd_shr(simd_add(a, b), int16x8_t(8, 8, 8, 8, 8, 8, 8, 8))) +} + +/// Add returning High Narrow. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn))] +pub unsafe fn vaddhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t { + simd_cast(simd_shr(simd_add(a, b), int32x4_t(16, 16, 16, 16))) +} + +/// Add returning High Narrow. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn))] +pub unsafe fn vaddhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t { + simd_cast(simd_shr(simd_add(a, b), int64x2_t(32, 32))) +} + +/// Add returning High Narrow. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn))] +pub unsafe fn vaddhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t { + simd_cast(simd_shr(simd_add(a, b), uint16x8_t(8, 8, 8, 8, 8, 8, 8, 8))) +} + +/// Add returning High Narrow. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn))] +pub unsafe fn vaddhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t { + simd_cast(simd_shr(simd_add(a, b), uint32x4_t(16, 16, 16, 16))) +} + +/// Add returning High Narrow. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn))] +pub unsafe fn vaddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t { + simd_cast(simd_shr(simd_add(a, b), uint64x2_t(32, 32))) +} + +/// Add returning High Narrow (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))] +pub unsafe fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t { + let x = simd_cast(simd_shr(simd_add(a, b), int16x8_t(8, 8, 8, 8, 8, 8, 8, 8))); + simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) +} + +/// Add returning High Narrow (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))] +pub unsafe fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t { + let x = simd_cast(simd_shr(simd_add(a, b), int32x4_t(16, 16, 16, 16))); + simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) +} + +/// Add returning High Narrow (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))] +pub unsafe fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t { + let x = simd_cast(simd_shr(simd_add(a, b), int64x2_t(32, 32))); + simd_shuffle4(r, x, [0, 1, 2, 3]) +} + +/// Add returning High Narrow (high half). 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))] +pub unsafe fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint16x8_t(8, 8, 8, 8, 8, 8, 8, 8))); + simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) +} + +/// Add returning High Narrow (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))] +pub unsafe fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint32x4_t(16, 16, 16, 16))); + simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) +} + +/// Add returning High Narrow (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddhn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))] +pub unsafe fn vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint64x2_t(32, 32))); + simd_shuffle4(r, x, [0, 1, 2, 3]) +} + +/// Rounding Add returning High Narrow. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn))] +pub unsafe fn vraddhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t { + vraddhn_s16_(a, b) +} + +/// Rounding Add returning High Narrow. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn))] +pub unsafe fn vraddhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t { + vraddhn_s32_(a, b) +} + +/// Rounding Add returning High Narrow. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i64))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn))] +pub unsafe fn vraddhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t { + vraddhn_s64_(a, b) +} + +/// Rounding Add returning High Narrow. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn))] +pub unsafe fn vraddhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t { + transmute(vraddhn_s16_(transmute(a), transmute(b))) +} + +/// Rounding Add returning High Narrow. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn))] +pub unsafe fn vraddhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t { + transmute(vraddhn_s32_(transmute(a), transmute(b))) +} + +/// Rounding Add returning High Narrow. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i64))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn))] +pub unsafe fn vraddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t { + transmute(vraddhn_s64_(transmute(a), transmute(b))) +} + +/// Rounding Add returning High Narrow (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))] +pub unsafe fn vraddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t { + let x = vraddhn_s16_(a, b); + simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) +} + +/// Rounding Add returning High Narrow (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))] +pub unsafe fn vraddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t { + let x = vraddhn_s32_(a, b); + simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) +} + +/// Rounding Add returning High Narrow (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i64))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))] +pub unsafe fn vraddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t { + let x = vraddhn_s64_(a, b); + simd_shuffle4(r, x, [0, 1, 2, 3]) +} + +/// Rounding Add returning High Narrow (high half). 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))] +pub unsafe fn vraddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t { + let x: uint8x8_t = transmute(vraddhn_s16_(transmute(a), transmute(b))); + simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) +} + +/// Rounding Add returning High Narrow (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))] +pub unsafe fn vraddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t { + let x: uint16x4_t = transmute(vraddhn_s32_(transmute(a), transmute(b))); + simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) +} + +/// Rounding Add returning High Narrow (high half). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vraddhn.i64))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))] +pub unsafe fn vraddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t { + let x: uint32x2_t = transmute(vraddhn_s64_(transmute(a), transmute(b))); + simd_shuffle4(r, x, [0, 1, 2, 3]) +} + +/// Signed Add Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.s8))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddlp))] +pub unsafe fn vpaddl_s8(a: int8x8_t) -> int16x4_t { + vpaddl_s8_(a) +} + +/// Signed Add Long Pairwise. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.s16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddlp))] +pub unsafe fn vpaddl_s16(a: int16x4_t) -> int32x2_t { + vpaddl_s16_(a) +} + +/// Signed Add Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.s32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddlp))] +pub unsafe fn vpaddl_s32(a: int32x2_t) -> int64x1_t { + vpaddl_s32_(a) +} + +/// Signed Add Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.s8))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddlp))] +pub unsafe fn vpaddlq_s8(a: int8x16_t) -> int16x8_t { + vpaddlq_s8_(a) +} + +/// Signed Add Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.s16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddlp))] +pub unsafe fn vpaddlq_s16(a: int16x8_t) -> int32x4_t { + vpaddlq_s16_(a) +} + +/// Signed Add Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.s32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddlp))] +pub unsafe fn vpaddlq_s32(a: int32x4_t) -> int64x2_t { + vpaddlq_s32_(a) +} + +/// Unsigned Add Long Pairwise. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.u8))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddlp))] +pub unsafe fn vpaddl_u8(a: uint8x8_t) -> uint16x4_t { + vpaddl_u8_(a) +} + +/// Unsigned Add Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.u16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddlp))] +pub unsafe fn vpaddl_u16(a: uint16x4_t) -> uint32x2_t { + vpaddl_u16_(a) +} + +/// Unsigned Add Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.u32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddlp))] +pub unsafe fn vpaddl_u32(a: uint32x2_t) -> uint64x1_t { + vpaddl_u32_(a) +} + +/// Unsigned Add Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.u8))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddlp))] +pub unsafe fn vpaddlq_u8(a: uint8x16_t) -> uint16x8_t { + vpaddlq_u8_(a) +} + +/// Unsigned Add Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.u16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddlp))] +pub unsafe fn vpaddlq_u16(a: uint16x8_t) -> uint32x4_t { + vpaddlq_u16_(a) +} + +/// Unsigned Add Long Pairwise. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpaddl.u32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddlp))] +pub unsafe fn vpaddlq_u32(a: uint32x4_t) -> uint64x2_t { + vpaddlq_u32_(a) +} + +/// Signed Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s8))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] +pub unsafe fn vpadal_s8(a: int16x4_t, b: int8x8_t) -> int16x4_t { + #[cfg(target_arch = "arm")] + { + vpadal_s8_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddl_s8_(b), a) + } +} + +/// Signed Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] +pub unsafe fn vpadal_s16(a: int32x2_t, b: int16x4_t) -> int32x2_t { + #[cfg(target_arch = "arm")] + { + vpadal_s16_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddl_s16_(b), a) + } +} + +/// Signed Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] +pub unsafe fn vpadal_s32(a: int64x1_t, b: int32x2_t) -> int64x1_t { + #[cfg(target_arch = "arm")] + { + vpadal_s32_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddl_s32_(b), a) + } +} + +/// Signed Add and Accumulate Long Pairwise. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s8))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] +pub unsafe fn vpadalq_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t { + #[cfg(target_arch = "arm")] + { + vpadalq_s8_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddlq_s8_(b), a) + } +} + +/// Signed Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] +pub unsafe fn vpadalq_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { + #[cfg(target_arch = "arm")] + { + vpadalq_s16_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddlq_s16_(b), a) + } +} + +/// Signed Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] +pub unsafe fn vpadalq_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { + #[cfg(target_arch = "arm")] + { + vpadalq_s32_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddlq_s32_(b), a) + } +} + +/// Unsigned Add and Accumulate Long Pairwise. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u8))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] +pub unsafe fn vpadal_u8(a: uint16x4_t, b: uint8x8_t) -> uint16x4_t { + #[cfg(target_arch = "arm")] + { + vpadal_u8_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddl_u8_(b), a) + } +} + +/// Unsigned Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] +pub unsafe fn vpadal_u16(a: uint32x2_t, b: uint16x4_t) -> uint32x2_t { + #[cfg(target_arch = "arm")] + { + vpadal_u16_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddl_u16_(b), a) + } +} + +/// Unsigned Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] +pub unsafe fn vpadal_u32(a: uint64x1_t, b: uint32x2_t) -> uint64x1_t { + #[cfg(target_arch = "arm")] + { + vpadal_u32_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddl_u32_(b), a) + } +} + +/// Unsigned Add and Accumulate Long Pairwise. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u8))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] +pub unsafe fn vpadalq_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { + #[cfg(target_arch = "arm")] + { + vpadalq_u8_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddlq_u8_(b), a) + } +} + +/// Unsigned Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] +pub unsafe fn vpadalq_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { + #[cfg(target_arch = "arm")] + { + vpadalq_u16_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddlq_u16_(b), a) + } +} + +/// Unsigned Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] +pub unsafe fn vpadalq_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { + #[cfg(target_arch = "arm")] + { + vpadalq_u32_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddlq_u32_(b), a) + } +} + /// Vector narrow integer. 
#[inline] #[target_feature(enable = "neon")] @@ -2834,6 +3742,776 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_high_s8() { + let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let x = i8::MAX; + let b = i8x16::new(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); + let x = x as i16; + let e = i16x8::new(x + 8, x + 9, x + 10, x + 11, x + 12, x + 13, x + 14, x + 15); + let r: i16x8 = transmute(vaddl_high_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_high_s16() { + let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let x = i16::MAX; + let b = i16x8::new(x, x, x, x, x, x, x, x); + let x = x as i32; + let e = i32x4::new(x + 4, x + 5, x + 6, x + 7); + let r: i32x4 = transmute(vaddl_high_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_high_s32() { + let a = i32x4::new(0, 1, 2, 3); + let x = i32::MAX; + let b = i32x4::new(x, x, x, x); + let x = x as i64; + let e = i64x2::new(x + 2, x + 3); + let r: i64x2 = transmute(vaddl_high_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_high_u8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let x = u8::MAX; + let b = u8x16::new(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); + let x = x as u16; + let e = u16x8::new(x + 8, x + 9, x + 10, x + 11, x + 12, x + 13, x + 14, x + 15); + let r: u16x8 = transmute(vaddl_high_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddl_high_u16() { + let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let x = u16::MAX; + let b = u16x8::new(x, x, x, x, x, x, x, x); + let x = x as u32; + let e = u32x4::new(x + 4, x + 5, x + 6, x + 7); + let r: u32x4 = transmute(vaddl_high_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = 
"neon")] + unsafe fn test_vaddl_high_u32() { + let a = u32x4::new(0, 1, 2, 3); + let x = u32::MAX; + let b = u32x4::new(x, x, x, x); + let x = x as u64; + let e = u64x2::new(x + 2, x + 3); + let r: u64x2 = transmute(vaddl_high_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_s8() { + let x = i16::MAX; + let a = i16x8::new(x, 1, 2, 3, 4, 5, 6, 7); + let y = i8::MAX; + let b = i8x8::new(y, y, y, y, y, y, y, y); + let y = y as i16; + let e = i16x8::new(x + y, 1 + y, 2 + y, 3 + y, 4 + y, 5 + y, 6 + y, 7 + y); + let r: i16x8 = transmute(vaddw_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_s16() { + let x = i32::MAX; + let a = i32x4::new(x, 1, 2, 3); + let y = i16::MAX; + let b = i16x4::new(y, y, y, y); + let y = y as i32; + let e = i32x4::new(x + y, 1 + y, 2 + y, 3 + y); + let r: i32x4 = transmute(vaddw_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_s32() { + let x = i64::MAX; + let a = i64x2::new(x, 1); + let y = i32::MAX; + let b = i32x2::new(y, y); + let y = y as i64; + let e = i64x2::new(x + y, 1 + y); + let r: i64x2 = transmute(vaddw_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_u8() { + let x = u16::MAX; + let a = u16x8::new(x, 1, 2, 3, 4, 5, 6, 7); + let y = u8::MAX; + let b = u8x8::new(y, y, y, y, y, y, y, y); + let y = y as u16; + let e = u16x8::new(x + y, 1 + y, 2 + y, 3 + y, 4 + y, 5 + y, 6 + y, 7 + y); + let r: u16x8 = transmute(vaddw_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_u16() { + let x = u32::MAX; + let a = u32x4::new(x, 1, 2, 3); + let y = u16::MAX; + let b = u16x4::new(y, y, y, y); + let y = y as u32; + let e = u32x4::new(x + y, 1 + y, 2 + y, 3 + y); + let r: u32x4 = transmute(vaddw_u16(transmute(a), 
transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_u32() { + let x = u64::MAX; + let a = u64x2::new(x, 1); + let y = u32::MAX; + let b = u32x2::new(y, y); + let y = y as u64; + let e = u64x2::new(x + y, 1 + y); + let r: u64x2 = transmute(vaddw_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_high_s8() { + let x = i16::MAX; + let a = i16x8::new(x, 1, 2, 3, 4, 5, 6, 7); + let y = i8::MAX; + let b = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, y, y, y, y, y, y, y, y); + let y = y as i16; + let e = i16x8::new(x + y, 1 + y, 2 + y, 3 + y, 4 + y, 5 + y, 6 + y, 7 + y); + let r: i16x8 = transmute(vaddw_high_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_high_s16() { + let x = i32::MAX; + let a = i32x4::new(x, 1, 2, 3); + let y = i16::MAX; + let b = i16x8::new(0, 0, 0, 0, y, y, y, y); + let y = y as i32; + let e = i32x4::new(x + y, 1 + y, 2 + y, 3 + y); + let r: i32x4 = transmute(vaddw_high_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_high_s32() { + let x = i64::MAX; + let a = i64x2::new(x, 1); + let y = i32::MAX; + let b = i32x4::new(0, 0, y, y); + let y = y as i64; + let e = i64x2::new(x + y, 1 + y); + let r: i64x2 = transmute(vaddw_high_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_high_u8() { + let x = u16::MAX; + let a = u16x8::new(x, 1, 2, 3, 4, 5, 6, 7); + let y = u8::MAX; + let b = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, y, y, y, y, y, y, y, y); + let y = y as u16; + let e = u16x8::new(x + y, 1 + y, 2 + y, 3 + y, 4 + y, 5 + y, 6 + y, 7 + y); + let r: u16x8 = transmute(vaddw_high_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_high_u16() { + let x = u32::MAX; + let a = u32x4::new(x, 1, 2, 3); + let y = 
u16::MAX; + let b = u16x8::new(0, 0, 0, 0, y, y, y, y); + let y = y as u32; + let e = u32x4::new(x + y, 1 + y, 2 + y, 3 + y); + let r: u32x4 = transmute(vaddw_high_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddw_high_u32() { + let x = u64::MAX; + let a = u64x2::new(x, 1); + let y = u32::MAX; + let b = u32x4::new(0, 0, y, y); + let y = y as u64; + let e = u64x2::new(x + y, 1 + y); + let r: u64x2 = transmute(vaddw_high_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddhn_s16() { + let a = i16x8::new( + (0 << 8) + 1, + (1 << 8) + 1, + (2 << 8) + 1, + (3 << 8) + 1, + (4 << 8) + 1, + (5 << 8) + 1, + (6 << 8) + 1, + (7 << 8) + 1, + ); + let e = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14); + let r: i8x8 = transmute(vaddhn_s16(transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddhn_s32() { + let a = i32x4::new((0 << 16) + 1, (1 << 16) + 1, (2 << 16) + 1, (3 << 16) + 1); + let e = i16x4::new(0, 2, 4, 6); + let r: i16x4 = transmute(vaddhn_s32(transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddhn_s64() { + let a = i64x2::new((0 << 32) + 1, (1 << 32) + 1); + let e = i32x2::new(0, 2); + let r: i32x2 = transmute(vaddhn_s64(transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddhn_u16() { + let a = u16x8::new( + (0 << 8) + 1, + (1 << 8) + 1, + (2 << 8) + 1, + (3 << 8) + 1, + (4 << 8) + 1, + (5 << 8) + 1, + (6 << 8) + 1, + (7 << 8) + 1, + ); + let e = u8x8::new(0, 2, 4, 6, 8, 10, 12, 14); + let r: u8x8 = transmute(vaddhn_u16(transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddhn_u32() { + let a = u32x4::new((0 << 16) + 1, (1 << 16) + 1, (2 << 16) + 1, (3 << 16) + 1); + let e = u16x4::new(0, 2, 4, 6); + let r: u16x4 = 
transmute(vaddhn_u32(transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddhn_u64() { + let a = u64x2::new((0 << 32) + 1, (1 << 32) + 1); + let e = u32x2::new(0, 2); + let r: u32x2 = transmute(vaddhn_u64(transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddhn_high_s16() { + let r = i8x8::splat(42); + let a = i16x8::new( + (0 << 8) + 1, + (1 << 8) + 1, + (2 << 8) + 1, + (3 << 8) + 1, + (4 << 8) + 1, + (5 << 8) + 1, + (6 << 8) + 1, + (7 << 8) + 1, + ); + let e = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 0, 2, 4, 6, 8, 10, 12, 14); + let r: i8x16 = transmute(vaddhn_high_s16(transmute(r), transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddhn_high_s32() { + let r = i16x4::splat(42); + let a = i32x4::new((0 << 16) + 1, (1 << 16) + 1, (2 << 16) + 1, (3 << 16) + 1); + let e = i16x8::new(42, 42, 42, 42, 0, 2, 4, 6); + let r: i16x8 = transmute(vaddhn_high_s32(transmute(r), transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddhn_high_s64() { + let r = i32x2::splat(42); + let a = i64x2::new((0 << 32) + 1, (1 << 32) + 1); + let e = i32x4::new(42, 42, 0, 2); + let r: i32x4 = transmute(vaddhn_high_s64(transmute(r), transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddhn_high_u16() { + let r = u8x8::splat(42); + let a = u16x8::new( + (0 << 8) + 1, + (1 << 8) + 1, + (2 << 8) + 1, + (3 << 8) + 1, + (4 << 8) + 1, + (5 << 8) + 1, + (6 << 8) + 1, + (7 << 8) + 1, + ); + let e = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 0, 2, 4, 6, 8, 10, 12, 14); + let r: u8x16 = transmute(vaddhn_high_u16(transmute(r), transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddhn_high_u32() { + let r = u16x4::splat(42); + let a = u32x4::new((0 << 16) + 1, (1 << 16) + 1, (2 << 
16) + 1, (3 << 16) + 1); + let e = u16x8::new(42, 42, 42, 42, 0, 2, 4, 6); + let r: u16x8 = transmute(vaddhn_high_u32(transmute(r), transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vaddhn_high_u64() { + let r = u32x2::splat(42); + let a = u64x2::new((0 << 32) + 1, (1 << 32) + 1); + let e = u32x4::new(42, 42, 0, 2); + let r: u32x4 = transmute(vaddhn_high_u64(transmute(r), transmute(a), transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vraddhn_s16() { + let round_constant: i16 = (1 << 8) - 1; + let a = i16x8::new( + 0 << 8, + 1 << 8, + 2 << 8, + 3 << 8, + 4 << 8, + 5 << 8, + 6 << 8, + 7 << 8, + ); + let b = i16x8::new( + 0 << 8, + (1 << 8) + round_constant, + 2 << 8, + (3 << 8) + round_constant, + 4 << 8, + (5 << 8) + round_constant, + 6 << 8, + (7 << 8) + round_constant, + ); + let e = i8x8::new(0, 3, 4, 7, 8, 11, 12, 15); + let r: i8x8 = transmute(vraddhn_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vraddhn_s32() { + let round_constant: i32 = (1 << 16) - 1; + let a = i32x4::new(0 << 16, 1 << 16, 2 << 16, 3 << 16); + let b = i32x4::new( + 0 << 16, + (1 << 16) + round_constant, + 2 << 16, + (3 << 16) + round_constant, + ); + let e = i16x4::new(0, 3, 4, 7); + let r: i16x4 = transmute(vraddhn_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vraddhn_s64() { + let round_constant: i64 = (1 << 32) - 1; + let a = i64x2::new(0 << 32, 1 << 32); + let b = i64x2::new(0 << 32, (1 << 32) + round_constant); + let e = i32x2::new(0, 3); + let r: i32x2 = transmute(vraddhn_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vraddhn_u16() { + let round_constant: u16 = (1 << 8) - 1; + let a = u16x8::new( + 0 << 8, + 1 << 8, + 2 << 8, + 3 << 8, + 4 << 8, + 5 << 8, + 6 << 8, + 7 << 8, + ); + let b = u16x8::new( + 0 
<< 8, + (1 << 8) + round_constant, + 2 << 8, + (3 << 8) + round_constant, + 4 << 8, + (5 << 8) + round_constant, + 6 << 8, + (7 << 8) + round_constant, + ); + let e = u8x8::new(0, 3, 4, 7, 8, 11, 12, 15); + let r: u8x8 = transmute(vraddhn_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vraddhn_u32() { + let round_constant: u32 = (1 << 16) - 1; + let a = u32x4::new(0 << 16, 1 << 16, 2 << 16, 3 << 16); + let b = u32x4::new( + 0 << 16, + (1 << 16) + round_constant, + 2 << 16, + (3 << 16) + round_constant, + ); + let e = u16x4::new(0, 3, 4, 7); + let r: u16x4 = transmute(vraddhn_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vraddhn_u64() { + let round_constant: u64 = (1 << 32) - 1; + let a = u64x2::new(0 << 32, 1 << 32); + let b = u64x2::new(0 << 32, (1 << 32) + round_constant); + let e = u32x2::new(0, 3); + let r: u32x2 = transmute(vraddhn_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vraddhn_high_s16() { + let r = i8x8::splat(42); + let round_constant: i16 = (1 << 8) - 1; + let a = i16x8::new( + 0 << 8, + 1 << 8, + 2 << 8, + 3 << 8, + 4 << 8, + 5 << 8, + 6 << 8, + 7 << 8, + ); + let b = i16x8::new( + 0 << 8, + (1 << 8) + round_constant, + 2 << 8, + (3 << 8) + round_constant, + 4 << 8, + (5 << 8) + round_constant, + 6 << 8, + (7 << 8) + round_constant, + ); + let e = i8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 0, 3, 4, 7, 8, 11, 12, 15); + let r: i8x16 = transmute(vraddhn_high_s16(transmute(r), transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vraddhn_high_s32() { + let r = i16x4::splat(42); + let round_constant: i32 = (1 << 16) - 1; + let a = i32x4::new(0 << 16, 1 << 16, 2 << 16, 3 << 16); + let b = i32x4::new( + 0 << 16, + (1 << 16) + round_constant, + 2 << 16, + (3 << 16) + round_constant, + ); + let e = i16x8::new(42, 42, 42, 
42, 0, 3, 4, 7); + let r: i16x8 = transmute(vraddhn_high_s32(transmute(r), transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vraddhn_high_s64() { + let r = i32x2::splat(42); + let round_constant: i64 = (1 << 32) - 1; + let a = i64x2::new(0 << 32, 1 << 32); + let b = i64x2::new(0 << 32, (1 << 32) + round_constant); + let e = i32x4::new(42, 42, 0, 3); + let r: i32x4 = transmute(vraddhn_high_s64(transmute(r), transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vraddhn_high_u16() { + let r = u8x8::splat(42); + let round_constant: u16 = (1 << 8) - 1; + let a = u16x8::new( + 0 << 8, + 1 << 8, + 2 << 8, + 3 << 8, + 4 << 8, + 5 << 8, + 6 << 8, + 7 << 8, + ); + let b = u16x8::new( + 0 << 8, + (1 << 8) + round_constant, + 2 << 8, + (3 << 8) + round_constant, + 4 << 8, + (5 << 8) + round_constant, + 6 << 8, + (7 << 8) + round_constant, + ); + let e = u8x16::new(42, 42, 42, 42, 42, 42, 42, 42, 0, 3, 4, 7, 8, 11, 12, 15); + let r: u8x16 = transmute(vraddhn_high_u16(transmute(r), transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vraddhn_high_u32() { + let r = u16x4::splat(42); + let round_constant: u32 = (1 << 16) - 1; + let a = u32x4::new(0 << 16, 1 << 16, 2 << 16, 3 << 16); + let b = u32x4::new( + 0 << 16, + (1 << 16) + round_constant, + 2 << 16, + (3 << 16) + round_constant, + ); + let e = u16x8::new(42, 42, 42, 42, 0, 3, 4, 7); + let r: u16x8 = transmute(vraddhn_high_u32(transmute(r), transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vraddhn_high_u64() { + let r = u32x2::splat(42); + let round_constant: u64 = (1 << 32) - 1; + let a = u64x2::new(0 << 32, 1 << 32); + let b = u64x2::new(0 << 32, (1 << 32) + round_constant); + let e = u32x4::new(42, 42, 0, 3); + let r: u32x4 = transmute(vraddhn_high_u64(transmute(r), transmute(a), transmute(b))); + assert_eq!(r, 
e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpaddl_s8() { + let a = i8x8::new(-4, -3, -2, -1, 0, 1, 2, 3); + let r: i16x4 = transmute(vpaddl_s8(transmute(a))); + let e = i16x4::new(-7, -3, 1, 5); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpaddl_s16() { + let a = i16x4::new(-2, -1, 0, 1); + let r: i32x2 = transmute(vpaddl_s16(transmute(a))); + let e = i32x2::new(-3, 1); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpaddl_s32() { + let a = i32x2::new(-1, 0); + let r: i64x1 = transmute(vpaddl_s32(transmute(a))); + let e = i64x1::new(-1); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpaddlq_s8() { + let a = i8x16::new(-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + let r: i16x8 = transmute(vpaddlq_s8(transmute(a))); + let e = i16x8::new(-15, -11, -7, -3, 1, 5, 9, 13); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpaddlq_s16() { + let a = i16x8::new(-4, -3, -2, -1, 0, 1, 2, 3); + let r: i32x4 = transmute(vpaddlq_s16(transmute(a))); + let e = i32x4::new(-7, -3, 1, 5); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpaddlq_s32() { + let a = i32x4::new(-2, -1, 0, 1); + let r: i64x2 = transmute(vpaddlq_s32(transmute(a))); + let e = i64x2::new(-3, 1); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpaddl_u8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, u8::MAX); + let r: u16x4 = transmute(vpaddl_u8(transmute(a))); + let e = u16x4::new(1, 5, 9, u8::MAX as u16 + 6); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpaddl_u16() { + let a = u16x4::new(0, 1, 2, u16::MAX); + let r: u32x2 = transmute(vpaddl_u16(transmute(a))); + let e = u32x2::new(1, u16::MAX as u32 + 2); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpaddl_u32() { + let a = u32x2::new(1, u32::MAX); + let r: u64x1 = transmute(vpaddl_u32(transmute(a))); + let e 
= u64x1::new(u32::MAX as u64 + 1); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpaddlq_u8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, u8::MAX); + let r: u16x8 = transmute(vpaddlq_u8(transmute(a))); + let e = u16x8::new(1, 5, 9, 13, 17, 21, 25, u8::MAX as u16 + 14); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpaddlq_u16() { + let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, u16::MAX); + let r: u32x4 = transmute(vpaddlq_u16(transmute(a))); + let e = u32x4::new(1, 5, 9, u16::MAX as u32 + 6); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpaddlq_u32() { + let a = u32x4::new(0, 1, 2, u32::MAX); + let r: u64x2 = transmute(vpaddlq_u32(transmute(a))); + let e = u64x2::new(1, u32::MAX as u64 + 2); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpadal_s8() { + let a = i16x4::new(42, 42, 42, 42); + let b = i8x8::new(-4, -3, -2, -1, 0, 1, 2, 3); + let r: i16x4 = transmute(vpadal_s8(transmute(a), transmute(b))); + let e = i16x4::new(35, 39, 43, 47); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpadal_s16() { + let a = i32x2::new(42, 42); + let b = i16x4::new(-2, -1, 0, 1); + let r: i32x2 = transmute(vpadal_s16(transmute(a), transmute(b))); + let e = i32x2::new(39, 43); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpadal_s32() { + let a = i64x1::new(42); + let b = i32x2::new(-1, 0); + let r: i64x1 = transmute(vpadal_s32(transmute(a), transmute(b))); + let e = i64x1::new(41); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpadalq_s8() { + let a = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b = i8x16::new(-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + let r: i16x8 = transmute(vpadalq_s8(transmute(a), transmute(b))); + let e = i16x8::new(27, 31, 35, 39, 43, 47, 51, 55); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn 
test_vpadalq_s16() { + let a = i32x4::new(42, 42, 42, 42); + let b = i16x8::new(-4, -3, -2, -1, 0, 1, 2, 3); + let r: i32x4 = transmute(vpadalq_s16(transmute(a), transmute(b))); + let e = i32x4::new(35, 39, 43, 47); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpadalq_s32() { + let a = i64x2::new(42, 42); + let b = i32x4::new(-2, -1, 0, 1); + let r: i64x2 = transmute(vpadalq_s32(transmute(a), transmute(b))); + let e = i64x2::new(39, 43); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpadal_u8() { + let a = u16x4::new(42, 42, 42, 42); + let b = u8x8::new(0, 1, 2, 3, 4, 5, 6, u8::MAX); + let r: u16x4 = transmute(vpadal_u8(transmute(a), transmute(b))); + let e = u16x4::new(43, 47, 51, u8::MAX as u16 + 48); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpadal_u16() { + let a = u32x2::new(42, 42); + let b = u16x4::new(0, 1, 2, u16::MAX); + let r: u32x2 = transmute(vpadal_u16(transmute(a), transmute(b))); + let e = u32x2::new(43, u16::MAX as u32 + 44); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpadal_u32() { + let a = u64x1::new(42); + let b = u32x2::new(1, u32::MAX); + let r: u64x1 = transmute(vpadal_u32(transmute(a), transmute(b))); + let e = u64x1::new(u32::MAX as u64 + 43); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpadalq_u8() { + let a = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let b = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, u8::MAX); + let r: u16x8 = transmute(vpadalq_u8(transmute(a), transmute(b))); + let e = u16x8::new(43, 47, 51, 55, 59, 63, 67, u8::MAX as u16 + 56); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpadalq_u16() { + let a = u32x4::new(42, 42, 42, 42); + let b = u16x8::new(0, 1, 2, 3, 4, 5, 6, u16::MAX); + let r: u32x4 = transmute(vpadalq_u16(transmute(a), transmute(b))); + let e = u32x4::new(43, 47, 51, u16::MAX as u32 + 48); + assert_eq!(r, 
e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vpadalq_u32() { + let a = u64x2::new(42, 42); + let b = u32x4::new(0, 1, 2, u32::MAX); + let r: u64x2 = transmute(vpadalq_u32(transmute(a), transmute(b))); + let e = u64x2::new(43, u32::MAX as u64 + 44); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmvn_s8() { let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); diff --git a/library/stdarch/crates/stdarch-gen/neon.spec b/library/stdarch/crates/stdarch-gen/neon.spec index 0343a7232eb7..5d21392eac46 100644 --- a/library/stdarch/crates/stdarch-gen/neon.spec +++ b/library/stdarch/crates/stdarch-gen/neon.spec @@ -1,6 +1,6 @@ // ARM Neon intrinsic specification. -// -// This file contains the specification for a number of +// +// This file contains the specification for a number of // intrinsics that allows us to generate them along with // their test cases. // @@ -31,7 +31,7 @@ // This is used to generate both aarch64 specific and // shared intrinics by first only specifying th aarch64 // variant then the arm variant. -// +// // arm - The arm v7 intrinics used to checked for arm code // generation. All neon functions available in arm are // also available in aarch64. If no aarch64 intrinic was @@ -60,7 +60,7 @@ // The special values 'TRUE' and 'FALSE' can be used to // represent the corect NEON representation of true or // false values. It too gets scaled to the type. -// +// // Validate needs to be called before generate as it sets // up the rules for validation that get generated for each // type. 
@@ -168,7 +168,7 @@ generate uint*_t /// Floating-point compare greater than name = vcgt fn = simd_gt -a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 +a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE @@ -212,7 +212,7 @@ generate uint*_t name = vclt fn = simd_lt a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 -b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 +b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE aarch64 = fcmgt @@ -256,7 +256,7 @@ generate uint*_t name = vcle fn = simd_le a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 -b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 +b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE aarch64 = fcmge generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t @@ -298,7 +298,7 @@ generate uint*_t /// Floating-point compare greater than or equal name = vcge fn = simd_ge -a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 +a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE @@ -383,25 +383,6 @@ link-arm = vqadds._EXT_ link-aarch64 = sqadd._EXT_ generate int*_t -// requires 1st and second argument to be different, this not implemented yet -// /// Signed saturating accumulate of unsigned value -// -// name = vuqadd -// a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 -// b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 -// e = 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 - -// it seems like we don't have those in rustland :( -// aarch64 = suqadd -// link-aarch64 = usqadd._EXT_ -// generate int64x*_t - -/ arm = suqadd -// link-arm = vuqadds._EXT_ -// link-aarch64 = suqadd._EXT_ -// generate int*_t - - /// Multiply name = vmul a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2