add vmovn_high, vrbit, vrnd, vsubhn neon instructions (#1103)

This commit is contained in:
Sparrow Li 2021-03-31 22:48:58 +08:00 committed by GitHub
parent ef9ec33482
commit 7b21d85a41
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 1309 additions and 14 deletions

View file

@@ -1816,6 +1816,60 @@ pub unsafe fn vmlsl_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uin
vmlsl_u32(a, b, c)
}
/// Extract narrow
///
/// Narrows each 16-bit lane of `b` to 8 bits by truncation and writes the
/// results to the upper half of the 16-lane result; `a` supplies the lower
/// half unchanged (AArch64 `xtn2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(xtn2))]
pub unsafe fn vmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t {
    // simd_cast to a narrower integer type truncates each lane.
    let c: int8x8_t = simd_cast(b);
    // Concatenate: lanes 0-7 from `a`, lanes 8-15 from the narrowed `c`.
    simd_shuffle16(a, c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
}
/// Extract narrow
///
/// Truncating narrow of `b`'s 32-bit lanes into the upper half of the result;
/// `a` fills the lower half (AArch64 `xtn2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(xtn2))]
pub unsafe fn vmovn_high_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t {
    let c: int16x4_t = simd_cast(b);
    simd_shuffle8(a, c, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Extract narrow
///
/// Truncating narrow of `b`'s 64-bit lanes into the upper half of the result;
/// `a` fills the lower half (AArch64 `xtn2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(xtn2))]
pub unsafe fn vmovn_high_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t {
    let c: int32x2_t = simd_cast(b);
    simd_shuffle4(a, c, [0, 1, 2, 3])
}
/// Extract narrow
///
/// Unsigned variant: truncating narrow of `b`'s 16-bit lanes into the upper
/// half of the result; `a` fills the lower half (AArch64 `xtn2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(xtn2))]
pub unsafe fn vmovn_high_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t {
    let c: uint8x8_t = simd_cast(b);
    simd_shuffle16(a, c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
}
/// Extract narrow
///
/// Unsigned variant: truncating narrow of `b`'s 32-bit lanes into the upper
/// half of the result; `a` fills the lower half (AArch64 `xtn2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(xtn2))]
pub unsafe fn vmovn_high_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t {
    let c: uint16x4_t = simd_cast(b);
    simd_shuffle8(a, c, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Extract narrow
///
/// Unsigned variant: truncating narrow of `b`'s 64-bit lanes into the upper
/// half of the result; `a` fills the lower half (AArch64 `xtn2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(xtn2))]
pub unsafe fn vmovn_high_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t {
    let c: uint32x2_t = simd_cast(b);
    simd_shuffle4(a, c, [0, 1, 2, 3])
}
/// Negate
#[inline]
#[target_feature(enable = "neon")]
@@ -1874,6 +1928,428 @@ pub unsafe fn vqnegq_s64(a: int64x2_t) -> int64x2_t {
vqnegq_s64_(a)
}
/// Reverse bit order
///
/// Reverses the order of the 8 bits within every byte lane (AArch64 `rbit`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(rbit))]
pub unsafe fn vrbit_s8(a: int8x8_t) -> int8x8_t {
    #[allow(improper_ctypes)]
    extern "C" {
        // Binding to the LLVM AArch64 NEON per-byte bit-reverse intrinsic.
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rbit.v8i8")]
        fn vrbit_s8_(a: int8x8_t) -> int8x8_t;
    }
    vrbit_s8_(a)
}
/// Reverse bit order
///
/// 128-bit variant: reverses the bits within each of the 16 byte lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(rbit))]
pub unsafe fn vrbitq_s8(a: int8x16_t) -> int8x16_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rbit.v16i8")]
        fn vrbitq_s8_(a: int8x16_t) -> int8x16_t;
    }
    vrbitq_s8_(a)
}
/// Reverse bit order
///
/// Bit reversal is sign-agnostic, so this reuses the signed implementation;
/// the transmutes only reinterpret the bit pattern.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(rbit))]
pub unsafe fn vrbit_u8(a: uint8x8_t) -> uint8x8_t {
    transmute(vrbit_s8(transmute(a)))
}
/// Reverse bit order
///
/// Unsigned 128-bit variant; delegates to the signed implementation via
/// bit-pattern-preserving transmutes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(rbit))]
pub unsafe fn vrbitq_u8(a: uint8x16_t) -> uint8x16_t {
    transmute(vrbitq_s8(transmute(a)))
}
/// Reverse bit order
///
/// Polynomial 64-bit variant; delegates to the signed implementation via
/// bit-pattern-preserving transmutes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(rbit))]
pub unsafe fn vrbit_p8(a: poly8x8_t) -> poly8x8_t {
    transmute(vrbit_s8(transmute(a)))
}
/// Reverse bit order
///
/// Polynomial 128-bit variant; delegates to the signed implementation via
/// bit-pattern-preserving transmutes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(rbit))]
pub unsafe fn vrbitq_p8(a: poly8x16_t) -> poly8x16_t {
    transmute(vrbitq_s8(transmute(a)))
}
/// Floating-point round to integral exact, using current rounding mode
///
/// Uses `llvm.rint`, which follows the runtime rounding mode (AArch64 `frintx`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintx))]
pub unsafe fn vrndx_f32(a: float32x2_t) -> float32x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.rint.v2f32")]
        fn vrndx_f32_(a: float32x2_t) -> float32x2_t;
    }
    vrndx_f32_(a)
}
/// Floating-point round to integral exact, using current rounding mode
///
/// Uses `llvm.rint`, which follows the runtime rounding mode (AArch64 `frintx`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintx))]
pub unsafe fn vrndxq_f32(a: float32x4_t) -> float32x4_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.rint.v4f32")]
        fn vrndxq_f32_(a: float32x4_t) -> float32x4_t;
    }
    vrndxq_f32_(a)
}
/// Floating-point round to integral exact, using current rounding mode
///
/// Uses `llvm.rint`, which follows the runtime rounding mode (AArch64 `frintx`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintx))]
pub unsafe fn vrndx_f64(a: float64x1_t) -> float64x1_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.rint.v1f64")]
        fn vrndx_f64_(a: float64x1_t) -> float64x1_t;
    }
    vrndx_f64_(a)
}
/// Floating-point round to integral exact, using current rounding mode
///
/// Uses `llvm.rint`, which follows the runtime rounding mode (AArch64 `frintx`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintx))]
pub unsafe fn vrndxq_f64(a: float64x2_t) -> float64x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.rint.v2f64")]
        fn vrndxq_f64_(a: float64x2_t) -> float64x2_t;
    }
    vrndxq_f64_(a)
}
/// Floating-point round to integral, to nearest with ties to away
///
/// Uses `llvm.round`: halfway cases round away from zero (AArch64 `frinta`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frinta))]
pub unsafe fn vrnda_f32(a: float32x2_t) -> float32x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.round.v2f32")]
        fn vrnda_f32_(a: float32x2_t) -> float32x2_t;
    }
    vrnda_f32_(a)
}
/// Floating-point round to integral, to nearest with ties to away
///
/// Uses `llvm.round`: halfway cases round away from zero (AArch64 `frinta`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frinta))]
pub unsafe fn vrndaq_f32(a: float32x4_t) -> float32x4_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.round.v4f32")]
        fn vrndaq_f32_(a: float32x4_t) -> float32x4_t;
    }
    vrndaq_f32_(a)
}
/// Floating-point round to integral, to nearest with ties to away
///
/// Uses `llvm.round`: halfway cases round away from zero (AArch64 `frinta`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frinta))]
pub unsafe fn vrnda_f64(a: float64x1_t) -> float64x1_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.round.v1f64")]
        fn vrnda_f64_(a: float64x1_t) -> float64x1_t;
    }
    vrnda_f64_(a)
}
/// Floating-point round to integral, to nearest with ties to away
///
/// Uses `llvm.round`: halfway cases round away from zero (AArch64 `frinta`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frinta))]
pub unsafe fn vrndaq_f64(a: float64x2_t) -> float64x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.round.v2f64")]
        fn vrndaq_f64_(a: float64x2_t) -> float64x2_t;
    }
    vrndaq_f64_(a)
}
/// Floating-point round to integral, to nearest with ties to even
///
/// Uses the AArch64-specific `frintn` intrinsic: halfway cases round to even.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintn))]
pub unsafe fn vrndn_f32(a: float32x2_t) -> float32x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frintn.v2f32")]
        fn vrndn_f32_(a: float32x2_t) -> float32x2_t;
    }
    vrndn_f32_(a)
}
/// Floating-point round to integral, to nearest with ties to even
///
/// Uses the AArch64-specific `frintn` intrinsic: halfway cases round to even.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintn))]
pub unsafe fn vrndnq_f32(a: float32x4_t) -> float32x4_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frintn.v4f32")]
        fn vrndnq_f32_(a: float32x4_t) -> float32x4_t;
    }
    vrndnq_f32_(a)
}
/// Floating-point round to integral, to nearest with ties to even
///
/// Uses the AArch64-specific `frintn` intrinsic: halfway cases round to even.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintn))]
pub unsafe fn vrndn_f64(a: float64x1_t) -> float64x1_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frintn.v1f64")]
        fn vrndn_f64_(a: float64x1_t) -> float64x1_t;
    }
    vrndn_f64_(a)
}
/// Floating-point round to integral, to nearest with ties to even
///
/// Uses the AArch64-specific `frintn` intrinsic: halfway cases round to even.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintn))]
pub unsafe fn vrndnq_f64(a: float64x2_t) -> float64x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frintn.v2f64")]
        fn vrndnq_f64_(a: float64x2_t) -> float64x2_t;
    }
    vrndnq_f64_(a)
}
/// Floating-point round to integral, toward minus infinity
///
/// Uses `llvm.floor` (AArch64 `frintm`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintm))]
pub unsafe fn vrndm_f32(a: float32x2_t) -> float32x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.floor.v2f32")]
        fn vrndm_f32_(a: float32x2_t) -> float32x2_t;
    }
    vrndm_f32_(a)
}
/// Floating-point round to integral, toward minus infinity
///
/// Uses `llvm.floor` (AArch64 `frintm`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintm))]
pub unsafe fn vrndmq_f32(a: float32x4_t) -> float32x4_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.floor.v4f32")]
        fn vrndmq_f32_(a: float32x4_t) -> float32x4_t;
    }
    vrndmq_f32_(a)
}
/// Floating-point round to integral, toward minus infinity
///
/// Uses `llvm.floor` (AArch64 `frintm`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintm))]
pub unsafe fn vrndm_f64(a: float64x1_t) -> float64x1_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.floor.v1f64")]
        fn vrndm_f64_(a: float64x1_t) -> float64x1_t;
    }
    vrndm_f64_(a)
}
/// Floating-point round to integral, toward minus infinity
///
/// Uses `llvm.floor` (AArch64 `frintm`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintm))]
pub unsafe fn vrndmq_f64(a: float64x2_t) -> float64x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.floor.v2f64")]
        fn vrndmq_f64_(a: float64x2_t) -> float64x2_t;
    }
    vrndmq_f64_(a)
}
/// Floating-point round to integral, toward plus infinity
///
/// Uses `llvm.ceil` (AArch64 `frintp`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintp))]
pub unsafe fn vrndp_f32(a: float32x2_t) -> float32x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ceil.v2f32")]
        fn vrndp_f32_(a: float32x2_t) -> float32x2_t;
    }
    vrndp_f32_(a)
}
/// Floating-point round to integral, toward plus infinity
///
/// Uses `llvm.ceil` (AArch64 `frintp`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintp))]
pub unsafe fn vrndpq_f32(a: float32x4_t) -> float32x4_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ceil.v4f32")]
        fn vrndpq_f32_(a: float32x4_t) -> float32x4_t;
    }
    vrndpq_f32_(a)
}
/// Floating-point round to integral, toward plus infinity
///
/// Uses `llvm.ceil` (AArch64 `frintp`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintp))]
pub unsafe fn vrndp_f64(a: float64x1_t) -> float64x1_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ceil.v1f64")]
        fn vrndp_f64_(a: float64x1_t) -> float64x1_t;
    }
    vrndp_f64_(a)
}
/// Floating-point round to integral, toward plus infinity
///
/// Uses `llvm.ceil` (AArch64 `frintp`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintp))]
pub unsafe fn vrndpq_f64(a: float64x2_t) -> float64x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ceil.v2f64")]
        fn vrndpq_f64_(a: float64x2_t) -> float64x2_t;
    }
    vrndpq_f64_(a)
}
/// Floating-point round to integral, toward zero
///
/// Uses `llvm.trunc` (AArch64 `frintz`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintz))]
pub unsafe fn vrnd_f32(a: float32x2_t) -> float32x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.trunc.v2f32")]
        fn vrnd_f32_(a: float32x2_t) -> float32x2_t;
    }
    vrnd_f32_(a)
}
/// Floating-point round to integral, toward zero
///
/// Uses `llvm.trunc` (AArch64 `frintz`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintz))]
pub unsafe fn vrndq_f32(a: float32x4_t) -> float32x4_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.trunc.v4f32")]
        fn vrndq_f32_(a: float32x4_t) -> float32x4_t;
    }
    vrndq_f32_(a)
}
/// Floating-point round to integral, toward zero
///
/// Uses `llvm.trunc` (AArch64 `frintz`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintz))]
pub unsafe fn vrnd_f64(a: float64x1_t) -> float64x1_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.trunc.v1f64")]
        fn vrnd_f64_(a: float64x1_t) -> float64x1_t;
    }
    vrnd_f64_(a)
}
/// Floating-point round to integral, toward zero
///
/// Uses `llvm.trunc` (AArch64 `frintz`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frintz))]
pub unsafe fn vrndq_f64(a: float64x2_t) -> float64x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.trunc.v2f64")]
        fn vrndq_f64_(a: float64x2_t) -> float64x2_t;
    }
    vrndq_f64_(a)
}
/// Floating-point round to integral, using current rounding mode
///
/// Uses `llvm.nearbyint`: follows the current rounding mode without raising
/// the Inexact exception (AArch64 `frinti`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frinti))]
pub unsafe fn vrndi_f32(a: float32x2_t) -> float32x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.nearbyint.v2f32")]
        fn vrndi_f32_(a: float32x2_t) -> float32x2_t;
    }
    vrndi_f32_(a)
}
/// Floating-point round to integral, using current rounding mode
///
/// Uses `llvm.nearbyint`: follows the current rounding mode without raising
/// the Inexact exception (AArch64 `frinti`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frinti))]
pub unsafe fn vrndiq_f32(a: float32x4_t) -> float32x4_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.nearbyint.v4f32")]
        fn vrndiq_f32_(a: float32x4_t) -> float32x4_t;
    }
    vrndiq_f32_(a)
}
/// Floating-point round to integral, using current rounding mode
///
/// Uses `llvm.nearbyint`: follows the current rounding mode without raising
/// the Inexact exception (AArch64 `frinti`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frinti))]
pub unsafe fn vrndi_f64(a: float64x1_t) -> float64x1_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.nearbyint.v1f64")]
        fn vrndi_f64_(a: float64x1_t) -> float64x1_t;
    }
    vrndi_f64_(a)
}
/// Floating-point round to integral, using current rounding mode
///
/// Uses `llvm.nearbyint`: follows the current rounding mode without raising
/// the Inexact exception (AArch64 `frinti`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(frinti))]
pub unsafe fn vrndiq_f64(a: float64x2_t) -> float64x2_t {
    #[allow(improper_ctypes)]
    extern "C" {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.nearbyint.v2f64")]
        fn vrndiq_f64_(a: float64x2_t) -> float64x2_t;
    }
    vrndiq_f64_(a)
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
@@ -5298,6 +5774,60 @@ mod test {
assert_eq!(r, e);
}
// vmovn_high_*: the lower half of the result must be `a` verbatim and the
// upper half must be `b` truncated lane-by-lane.
#[simd_test(enable = "neon")]
unsafe fn test_vmovn_high_s16() {
    let a: i8x8 = i8x8::new(0, 1, 2, 3, 2, 3, 4, 5);
    let b: i16x8 = i16x8::new(2, 3, 4, 5, 12, 13, 14, 15);
    let e: i8x16 = i8x16::new(0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15);
    let r: i8x16 = transmute(vmovn_high_s16(transmute(a), transmute(b)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmovn_high_s32() {
    let a: i16x4 = i16x4::new(0, 1, 2, 3);
    let b: i32x4 = i32x4::new(2, 3, 4, 5);
    let e: i16x8 = i16x8::new(0, 1, 2, 3, 2, 3, 4, 5);
    let r: i16x8 = transmute(vmovn_high_s32(transmute(a), transmute(b)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmovn_high_s64() {
    let a: i32x2 = i32x2::new(0, 1);
    let b: i64x2 = i64x2::new(2, 3);
    let e: i32x4 = i32x4::new(0, 1, 2, 3);
    let r: i32x4 = transmute(vmovn_high_s64(transmute(a), transmute(b)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmovn_high_u16() {
    let a: u8x8 = u8x8::new(0, 1, 2, 3, 2, 3, 4, 5);
    let b: u16x8 = u16x8::new(2, 3, 4, 5, 12, 13, 14, 15);
    let e: u8x16 = u8x16::new(0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15);
    let r: u8x16 = transmute(vmovn_high_u16(transmute(a), transmute(b)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmovn_high_u32() {
    let a: u16x4 = u16x4::new(0, 1, 2, 3);
    let b: u32x4 = u32x4::new(2, 3, 4, 5);
    let e: u16x8 = u16x8::new(0, 1, 2, 3, 2, 3, 4, 5);
    let r: u16x8 = transmute(vmovn_high_u32(transmute(a), transmute(b)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmovn_high_u64() {
    let a: u32x2 = u32x2::new(0, 1);
    let b: u64x2 = u64x2::new(2, 3);
    let e: u32x4 = u32x4::new(0, 1, 2, 3);
    let r: u32x4 = transmute(vmovn_high_u64(transmute(a), transmute(b)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vneg_s64() {
let a: i64x1 = i64x1::new(0);
@@ -5346,6 +5876,278 @@ mod test {
assert_eq!(r, e);
}
// vrbit: bits are reversed within each byte, e.g. 2 = 0b0000_0010 reverses
// to 0b0100_0000 = 64.
#[simd_test(enable = "neon")]
unsafe fn test_vrbit_s8() {
    let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
    let e: i8x8 = i8x8::new(0, 64, 32, 96, 16, 80, 48, 112);
    let r: i8x8 = transmute(vrbit_s8(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrbitq_s8() {
    let a: i8x16 = i8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
    let e: i8x16 = i8x16::new(0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120);
    let r: i8x16 = transmute(vrbitq_s8(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrbit_u8() {
    let a: u8x8 = u8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
    let e: u8x8 = u8x8::new(0, 64, 32, 96, 16, 80, 48, 112);
    let r: u8x8 = transmute(vrbit_u8(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrbitq_u8() {
    let a: u8x16 = u8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
    let e: u8x16 = u8x16::new(0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120);
    let r: u8x16 = transmute(vrbitq_u8(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrbit_p8() {
    let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
    let e: i8x8 = i8x8::new(0, 64, 32, 96, 16, 80, 48, 112);
    let r: i8x8 = transmute(vrbit_p8(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrbitq_p8() {
    let a: i8x16 = i8x16::new(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
    let e: i8x16 = i8x16::new(0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120);
    let r: i8x16 = transmute(vrbitq_p8(transmute(a)));
    assert_eq!(r, e);
}
// vrndx (rint): under the default round-to-nearest-even mode ties go to
// even, so 0.5 -> 0.0 and 2.5 -> 2.0.
#[simd_test(enable = "neon")]
unsafe fn test_vrndx_f32() {
    let a: f32x2 = f32x2::new(-1.5, 0.5);
    let e: f32x2 = f32x2::new(-2.0, 0.0);
    let r: f32x2 = transmute(vrndx_f32(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndxq_f32() {
    let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5);
    let e: f32x4 = f32x4::new(-2.0, 0.0, 2.0, 2.0);
    let r: f32x4 = transmute(vrndxq_f32(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndx_f64() {
    let a: f64 = -1.5;
    let e: f64 = -2.0;
    let r: f64 = transmute(vrndx_f64(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndxq_f64() {
    let a: f64x2 = f64x2::new(-1.5, 0.5);
    let e: f64x2 = f64x2::new(-2.0, 0.0);
    let r: f64x2 = transmute(vrndxq_f64(transmute(a)));
    assert_eq!(r, e);
}
// vrnda: ties round away from zero, so 0.5 -> 1.0 and -1.5 -> -2.0.
#[simd_test(enable = "neon")]
unsafe fn test_vrnda_f32() {
    let a: f32x2 = f32x2::new(-1.5, 0.5);
    let e: f32x2 = f32x2::new(-2.0, 1.0);
    let r: f32x2 = transmute(vrnda_f32(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndaq_f32() {
    let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5);
    let e: f32x4 = f32x4::new(-2.0, 1.0, 2.0, 3.0);
    let r: f32x4 = transmute(vrndaq_f32(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrnda_f64() {
    let a: f64 = -1.5;
    let e: f64 = -2.0;
    let r: f64 = transmute(vrnda_f64(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndaq_f64() {
    let a: f64x2 = f64x2::new(-1.5, 0.5);
    let e: f64x2 = f64x2::new(-2.0, 1.0);
    let r: f64x2 = transmute(vrndaq_f64(transmute(a)));
    assert_eq!(r, e);
}
// vrndn: ties round to even, so 0.5 -> 0.0 and 2.5 -> 2.0.
#[simd_test(enable = "neon")]
unsafe fn test_vrndn_f32() {
    let a: f32x2 = f32x2::new(-1.5, 0.5);
    let e: f32x2 = f32x2::new(-2.0, 0.0);
    let r: f32x2 = transmute(vrndn_f32(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndnq_f32() {
    let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5);
    let e: f32x4 = f32x4::new(-2.0, 0.0, 2.0, 2.0);
    let r: f32x4 = transmute(vrndnq_f32(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndn_f64() {
    let a: f64 = -1.5;
    let e: f64 = -2.0;
    let r: f64 = transmute(vrndn_f64(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndnq_f64() {
    let a: f64x2 = f64x2::new(-1.5, 0.5);
    let e: f64x2 = f64x2::new(-2.0, 0.0);
    let r: f64x2 = transmute(vrndnq_f64(transmute(a)));
    assert_eq!(r, e);
}
// vrndm: rounds toward minus infinity (floor).
#[simd_test(enable = "neon")]
unsafe fn test_vrndm_f32() {
    let a: f32x2 = f32x2::new(-1.5, 0.5);
    let e: f32x2 = f32x2::new(-2.0, 0.0);
    let r: f32x2 = transmute(vrndm_f32(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndmq_f32() {
    let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5);
    let e: f32x4 = f32x4::new(-2.0, 0.0, 1.0, 2.0);
    let r: f32x4 = transmute(vrndmq_f32(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndm_f64() {
    let a: f64 = -1.5;
    let e: f64 = -2.0;
    let r: f64 = transmute(vrndm_f64(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndmq_f64() {
    let a: f64x2 = f64x2::new(-1.5, 0.5);
    let e: f64x2 = f64x2::new(-2.0, 0.0);
    let r: f64x2 = transmute(vrndmq_f64(transmute(a)));
    assert_eq!(r, e);
}
// vrndp: rounds toward plus infinity (ceil).
#[simd_test(enable = "neon")]
unsafe fn test_vrndp_f32() {
    let a: f32x2 = f32x2::new(-1.5, 0.5);
    let e: f32x2 = f32x2::new(-1.0, 1.0);
    let r: f32x2 = transmute(vrndp_f32(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndpq_f32() {
    let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5);
    let e: f32x4 = f32x4::new(-1.0, 1.0, 2.0, 3.0);
    let r: f32x4 = transmute(vrndpq_f32(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndp_f64() {
    let a: f64 = -1.5;
    let e: f64 = -1.0;
    let r: f64 = transmute(vrndp_f64(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndpq_f64() {
    let a: f64x2 = f64x2::new(-1.5, 0.5);
    let e: f64x2 = f64x2::new(-1.0, 1.0);
    let r: f64x2 = transmute(vrndpq_f64(transmute(a)));
    assert_eq!(r, e);
}
// vrnd: rounds toward zero (truncate), so -1.5 -> -1.0.
#[simd_test(enable = "neon")]
unsafe fn test_vrnd_f32() {
    let a: f32x2 = f32x2::new(-1.5, 0.5);
    let e: f32x2 = f32x2::new(-1.0, 0.0);
    let r: f32x2 = transmute(vrnd_f32(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndq_f32() {
    let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5);
    let e: f32x4 = f32x4::new(-1.0, 0.0, 1.0, 2.0);
    let r: f32x4 = transmute(vrndq_f32(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrnd_f64() {
    let a: f64 = -1.5;
    let e: f64 = -1.0;
    let r: f64 = transmute(vrnd_f64(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndq_f64() {
    let a: f64x2 = f64x2::new(-1.5, 0.5);
    let e: f64x2 = f64x2::new(-1.0, 0.0);
    let r: f64x2 = transmute(vrndq_f64(transmute(a)));
    assert_eq!(r, e);
}
// vrndi (nearbyint): follows the current rounding mode; under the default
// round-to-nearest-even, ties go to even.
#[simd_test(enable = "neon")]
unsafe fn test_vrndi_f32() {
    let a: f32x2 = f32x2::new(-1.5, 0.5);
    let e: f32x2 = f32x2::new(-2.0, 0.0);
    let r: f32x2 = transmute(vrndi_f32(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndiq_f32() {
    let a: f32x4 = f32x4::new(-1.5, 0.5, 1.5, 2.5);
    let e: f32x4 = f32x4::new(-2.0, 0.0, 2.0, 2.0);
    let r: f32x4 = transmute(vrndiq_f32(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndi_f64() {
    let a: f64 = -1.5;
    let e: f64 = -2.0;
    let r: f64 = transmute(vrndi_f64(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrndiq_f64() {
    let a: f64x2 = f64x2::new(-1.5, 0.5);
    let e: f64x2 = f64x2::new(-2.0, 0.0);
    let r: f64x2 = transmute(vrndiq_f64(transmute(a)));
    assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_f64() {
let a: f64 = 1.0;

View file

@@ -3948,6 +3948,138 @@ pub unsafe fn vsubq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
simd_sub(a, b)
}
/// Subtract returning high narrow
///
/// Computes the wrapping difference `a - b` and returns the most significant
/// half of each lane, i.e. `(a - b) >> 8` narrowed to 8 bits.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
pub unsafe fn vsubhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t {
    // Per-lane shift count of half the lane width selects the high half.
    let c: i16x8 = i16x8::new(8, 8, 8, 8, 8, 8, 8, 8);
    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
}
/// Subtract returning high narrow
///
/// High half of each 32-bit wrapping difference, narrowed to 16 bits.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
pub unsafe fn vsubhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t {
    let c: i32x4 = i32x4::new(16, 16, 16, 16);
    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
}
/// Subtract returning high narrow
///
/// High half of each 64-bit wrapping difference, narrowed to 32 bits.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
pub unsafe fn vsubhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t {
    let c: i64x2 = i64x2::new(32, 32);
    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
}
/// Subtract returning high narrow
///
/// Unsigned variant: high half of each 16-bit wrapping difference,
/// narrowed to 8 bits (logical shift on unsigned lanes).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
pub unsafe fn vsubhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t {
    let c: u16x8 = u16x8::new(8, 8, 8, 8, 8, 8, 8, 8);
    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
}
/// Subtract returning high narrow
///
/// Unsigned variant: high half of each 32-bit wrapping difference,
/// narrowed to 16 bits.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
pub unsafe fn vsubhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t {
    let c: u32x4 = u32x4::new(16, 16, 16, 16);
    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
}
/// Subtract returning high narrow
///
/// Unsigned variant: high half of each 64-bit wrapping difference,
/// narrowed to 32 bits.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
pub unsafe fn vsubhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
    let c: u64x2 = u64x2::new(32, 32);
    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
}
/// Subtract returning high narrow
///
/// Narrows the high half of each lane of `b - c` (as `vsubhn_s16`) and
/// places the results in the upper half of the output; `a` supplies the
/// lower half unchanged (AArch64 `subhn2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
pub unsafe fn vsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t {
    let d: int8x8_t = vsubhn_s16(b, c);
    // Concatenate `a` (low lanes) with the narrowed difference (high lanes).
    simd_shuffle16(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
}
/// Subtract returning high narrow
///
/// High-narrowed `b - c` into the upper half; `a` fills the lower half
/// (AArch64 `subhn2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
pub unsafe fn vsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t {
    let d: int16x4_t = vsubhn_s32(b, c);
    simd_shuffle8(a, d, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Subtract returning high narrow
///
/// High-narrowed `b - c` into the upper half; `a` fills the lower half
/// (AArch64 `subhn2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
pub unsafe fn vsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t {
    let d: int32x2_t = vsubhn_s64(b, c);
    simd_shuffle4(a, d, [0, 1, 2, 3])
}
/// Subtract returning high narrow
///
/// Unsigned variant: high-narrowed `b - c` into the upper half; `a` fills
/// the lower half (AArch64 `subhn2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
pub unsafe fn vsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t {
    let d: uint8x8_t = vsubhn_u16(b, c);
    simd_shuffle16(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
}
/// Subtract returning high narrow
///
/// Unsigned variant: high-narrowed `b - c` into the upper half; `a` fills
/// the lower half (AArch64 `subhn2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
pub unsafe fn vsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t {
    let d: uint16x4_t = vsubhn_u32(b, c);
    simd_shuffle8(a, d, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Subtract returning high narrow
///
/// Unsigned variant: high-narrowed `b - c` into the upper half; `a` fills
/// the lower half (AArch64 `subhn2`).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
pub unsafe fn vsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t {
    let d: uint32x2_t = vsubhn_u64(b, c);
    simd_shuffle4(a, d, [0, 1, 2, 3])
}
/// Signed halving subtract
#[inline]
#[target_feature(enable = "neon")]
@@ -10222,6 +10354,120 @@ mod test {
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsubhn_s16() {
let a: i16x8 = i16x8::new(0x7F_FF, -32768, 1, 1, 0x7F_FF, -32768, 1, 1);
let b: i16x8 = i16x8::new(1, 0, 0, 0, 1, 0, 0, 0);
let e: i8x8 = i8x8::new(0x7F, -128, 0, 0, 0x7F, -128, 0, 0);
let r: i8x8 = transmute(vsubhn_s16(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsubhn_s32() {
let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, -2147483648, 1, 1);
let b: i32x4 = i32x4::new(1, 0, 0, 0);
let e: i16x4 = i16x4::new(0x7F_FF, -32768, 0, 0);
let r: i16x4 = transmute(vsubhn_s32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsubhn_s64() {
let a: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, -9223372036854775808);
let b: i64x2 = i64x2::new(1, 0);
let e: i32x2 = i32x2::new(0x7F_FF_FF_FF, -2147483648);
let r: i32x2 = transmute(vsubhn_s64(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsubhn_u16() {
let a: u16x8 = u16x8::new(0xFF_FF, 0, 1, 1, 0xFF_FF, 0, 1, 1);
let b: u16x8 = u16x8::new(1, 0, 0, 0, 1, 0, 0, 0);
let e: u8x8 = u8x8::new(0xFF, 0, 0, 0, 0xFF, 0, 0, 0);
let r: u8x8 = transmute(vsubhn_u16(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsubhn_u32() {
let a: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 1, 1);
let b: u32x4 = u32x4::new(1, 0, 0, 0);
let e: u16x4 = u16x4::new(0xFF_FF, 0, 0, 0);
let r: u16x4 = transmute(vsubhn_u32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsubhn_u64() {
let a: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0);
let b: u64x2 = u64x2::new(1, 0);
let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
let r: u32x2 = transmute(vsubhn_u64(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsubhn_high_s16() {
let a: i8x8 = i8x8::new(0x7F, 0, 0x7F, 0, 0x7F, 0, 0x7F, 0);
let b: i16x8 = i16x8::new(0x7F_FF, 1, 0x7F_FF, 1, 0x7F_FF, 1, 0x7F_FF, 1);
let c: i16x8 = i16x8::new(1, 0, 1, 0, 1, 0, 1, 0);
let e: i8x16 = i8x16::new(0x7F, 0, 0x7F, 0, 0x7F, 0, 0x7F, 0, 0x7F, 0, 0x7F, 0, 0x7F, 0, 0x7F, 0);
let r: i8x16 = transmute(vsubhn_high_s16(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsubhn_high_s32() {
let a: i16x4 = i16x4::new(0x7F_FF, 0, 0x7F_FF, 0);
let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 1, 0x7F_FF_FF_FF, 1);
let c: i32x4 = i32x4::new(1, 0, 1, 0);
let e: i16x8 = i16x8::new(0x7F_FF, 0, 0x7F_FF, 0, 0x7F_FF, 0, 0x7F_FF, 0);
let r: i16x8 = transmute(vsubhn_high_s32(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsubhn_high_s64() {
    // vsubhn_high_s64: low half from `low`, high half is the narrowed
    // high 32 bits of each 64-bit difference.
    let low: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0);
    let minuend: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, 1);
    let subtrahend: i64x2 = i64x2::new(1, 0);
    let expected: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0, 0x7F_FF_FF_FF, 0);
    let actual: i32x4 = transmute(vsubhn_high_s64(
        transmute(low),
        transmute(minuend),
        transmute(subtrahend),
    ));
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsubhn_high_u16() {
    // Unsigned counterpart of test_vsubhn_high_s16, saturating at 0xFF_FF.
    let low: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0);
    let minuend: u16x8 = u16x8::new(0xFF_FF, 1, 0xFF_FF, 1, 0xFF_FF, 1, 0xFF_FF, 1);
    let subtrahend: u16x8 = u16x8::new(1, 0, 1, 0, 1, 0, 1, 0);
    let expected: u8x16 = u8x16::new(
        0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
    );
    let actual: u8x16 = transmute(vsubhn_high_u16(
        transmute(low),
        transmute(minuend),
        transmute(subtrahend),
    ));
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsubhn_high_u32() {
    // Unsigned counterpart of test_vsubhn_high_s32.
    let low: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0);
    let minuend: u32x4 = u32x4::new(0xFF_FF_FF_FF, 1, 0xFF_FF_FF_FF, 1);
    let subtrahend: u32x4 = u32x4::new(1, 0, 1, 0);
    let expected: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0);
    let actual: u16x8 = transmute(vsubhn_high_u32(
        transmute(low),
        transmute(minuend),
        transmute(subtrahend),
    ));
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vsubhn_high_u64() {
    // Unsigned counterpart of test_vsubhn_high_s64.
    let low: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
    let minuend: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 1);
    let subtrahend: u64x2 = u64x2::new(1, 0);
    let expected: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0);
    let actual: u32x4 = transmute(vsubhn_high_u64(
        transmute(low),
        transmute(minuend),
        transmute(subtrahend),
    ));
    assert_eq!(actual, expected);
}
#[simd_test(enable = "neon")]
unsafe fn test_vhsub_u8() {
let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);

View file

@ -5391,6 +5391,46 @@ pub unsafe fn vrev32q_u8(a: uint8x16_t) -> uint8x16_t {
simd_shuffle16(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
}
/// Reversing vector elements (swap endianness)
///
/// Swaps the two 16-bit lanes inside each 32-bit word of `a`
/// (lane order [0, 1, 2, 3] becomes [1, 0, 3, 2]).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32_s16(a: int16x4_t) -> int16x4_t {
// Pure shuffle: no lane values change, only their positions.
simd_shuffle4(a, a, [1, 0, 3, 2])
}
/// Reversing vector elements (swap endianness)
///
/// Swaps the two 16-bit lanes inside each 32-bit word of `a`
/// (128-bit variant: pairs [0,1], [2,3], [4,5], [6,7] are each swapped).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32q_s16(a: int16x8_t) -> int16x8_t {
// Pure shuffle: no lane values change, only their positions.
simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
}
/// Reversing vector elements (swap endianness)
///
/// Polynomial-type variant of [`vrev32_s16`]: swaps the two 16-bit lanes
/// inside each 32-bit word of `a`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32_p16(a: poly16x4_t) -> poly16x4_t {
// Same lane permutation as the signed variant; only the element type differs.
simd_shuffle4(a, a, [1, 0, 3, 2])
}
/// Reversing vector elements (swap endianness)
///
/// Polynomial-type variant of [`vrev32q_s16`]: swaps the two 16-bit lanes
/// inside each 32-bit word of `a` (128-bit vector).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32q_p16(a: poly16x8_t) -> poly16x8_t {
// Same lane permutation as the signed variant; only the element type differs.
simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
}
/// Reversing vector elements (swap endianness)
#[inline]
#[target_feature(enable = "neon")]
@ -10792,6 +10832,34 @@ mod tests {
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrev32_s16() {
    // vrev32 on 16-bit lanes: swap each adjacent pair within a 32-bit word.
    // (Original generated code named the expected value `r` and the computed
    // value `e`; renamed here for clarity — assertion is unchanged.)
    let input = i16x4::new(0, 1, 2, 3);
    let expected = i16x4::new(1, 0, 3, 2);
    let actual: i16x4 = transmute(vrev32_s16(transmute(input)));
    assert_eq!(expected, actual);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrev32q_s16() {
    // 128-bit variant: four independent pair swaps.
    let input = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
    let expected = i16x8::new(1, 0, 3, 2, 5, 4, 7, 6);
    let actual: i16x8 = transmute(vrev32q_s16(transmute(input)));
    assert_eq!(expected, actual);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrev32_p16() {
    // Polynomial variant behaves identically to the signed one; the test
    // drives it through i16x4 via transmute, as the generated suite does.
    let input = i16x4::new(0, 1, 2, 3);
    let expected = i16x4::new(1, 0, 3, 2);
    let actual: i16x4 = transmute(vrev32_p16(transmute(input)));
    assert_eq!(expected, actual);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrev32q_p16() {
    // 128-bit polynomial variant: four independent pair swaps.
    let input = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
    let expected = i16x8::new(1, 0, 3, 2, 5, 4, 7, 6);
    let actual: i16x8 = transmute(vrev32q_p16(transmute(input)));
    assert_eq!(expected, actual);
}
#[simd_test(enable = "neon")]
unsafe fn test_vrev32_u16() {
let a = u16x4::new(0, 1, 2, 3);
let r = u16x4::new(1, 0, 3, 2);

View file

@ -1050,6 +1050,19 @@ validate 14, 13, 12, 11, 10, 9, 8, 7
aarch64 = umlsl2
generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
/// Extract narrow
name = vmovn_high
no-q
multi_fn = simd_cast, c:in_t0, b
multi_fn = simd_shuffle-out_len-noext, a, c, {asc-out_len}
a = 0, 1, 2, 3, 2, 3, 4, 5
b = 2, 3, 4, 5, 12, 13, 14, 15
validate 0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15
aarch64 = xtn2
generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t
/// Negate
name = vneg
fn = simd_neg
@ -1111,20 +1124,38 @@ a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29
arm = vhadd.s
aarch64 = uhadd
link-aarch64 = uhadd._EXT_
link-arm = vhaddu._EXT_
generate uint*_t
arm = vhadd.s
aarch64 = shadd
link-aarch64 = shadd._EXT_
link-arm = vhadds._EXT_
generate int*_t
/// Reverse bit order
name = vrbit
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120
aarch64 = rbit
link-aarch64 = rbit._EXT_
generate int8x8_t, int8x16_t
/// Reverse bit order
name = vrbit
multi_fn = transmute, {vrbit-signed-noext, transmute(a)}
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120
aarch64 = rbit
generate uint8x8_t, uint8x16_t, poly8x8_t, poly8x16_t
/// Rounding halving add
name = vrhadd
a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
@ -1143,6 +1174,69 @@ link-arm = vrhadds._EXT_
link-aarch64 = srhadd._EXT_
generate int*_t
/// Floating-point round to integral exact, using current rounding mode
name = vrndx
a = -1.5, 0.5, 1.5, 2.5
validate -2.0, 0.0, 2.0, 2.0
aarch64 = frintx
link-aarch64 = llvm.rint._EXT_
generate float*_t, float64x*_t
/// Floating-point round to integral, to nearest with ties to away
name = vrnda
a = -1.5, 0.5, 1.5, 2.5
validate -2.0, 1.0, 2.0, 3.0
aarch64 = frinta
link-aarch64 = llvm.round._EXT_
generate float*_t, float64x*_t
/// Floating-point round to integral, to nearest with ties to even
name = vrndn
a = -1.5, 0.5, 1.5, 2.5
validate -2.0, 0.0, 2.0, 2.0
link-aarch64 = frintn._EXT_
aarch64 = frintn
generate float*_t, float64x*_t
/// Floating-point round to integral, toward minus infinity
name = vrndm
a = -1.5, 0.5, 1.5, 2.5
validate -2.0, 0.0, 1.0, 2.0
aarch64 = frintm
link-aarch64 = llvm.floor._EXT_
generate float*_t, float64x*_t
/// Floating-point round to integral, toward plus infinity
name = vrndp
a = -1.5, 0.5, 1.5, 2.5
validate -1.0, 1.0, 2.0, 3.0
aarch64 = frintp
link-aarch64 = llvm.ceil._EXT_
generate float*_t, float64x*_t
/// Floating-point round to integral, toward zero
name = vrnd
a = -1.5, 0.5, 1.5, 2.5
validate -1.0, 0.0, 1.0, 2.0
aarch64 = frintz
link-aarch64 = llvm.trunc._EXT_
generate float*_t, float64x*_t
/// Floating-point round to integral, using current rounding mode
name = vrndi
a = -1.5, 0.5, 1.5, 2.5
validate -2.0, 0.0, 2.0, 2.0
aarch64 = frinti
link-aarch64 = llvm.nearbyint._EXT_
generate float*_t, float64x*_t
/// Saturating add
name = vqadd
a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
@ -1295,6 +1389,35 @@ generate float64x*_t
arm = vsub.
generate float*_t
/// Subtract returning high narrow
name = vsubhn
no-q
multi_fn = fixed, c:in_t
multi_fn = simd_cast, {simd_shr, {simd_sub}, transmute(c)}
a = MAX, MIN, 1, 1, MAX, MIN, 1, 1
b = 1, 0, 0, 0, 1, 0, 0, 0
fixed = HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS
validate MAX, MIN, 0, 0, MAX, MIN, 0, 0
arm = vsubhn
aarch64 = subhn
generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
/// Subtract returning high narrow
name = vsubhn_high
no-q
multi_fn = vsubhn-noqself-noext, d:in_t0, b, c
multi_fn = simd_shuffle-out_len-noext, a, d, {asc-out_len}
a = MAX, 0, MAX, 0, MAX, 0, MAX, 0
b = MAX, 1, MAX, 1, MAX, 1, MAX, 1
c = 1, 0, 1, 0, 1, 0, 1, 0
validate MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0
arm = vsubhn
aarch64 = subhn2
generate int8x8_t:int16x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int64x2_t:int32x4_t
generate uint8x8_t:uint16x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint64x2_t:uint32x4_t
/// Signed halving subtract
name = vhsub

View file

@ -119,14 +119,14 @@ fn type_to_suffix(t: &str) -> &str {
fn type_to_signed_suffix(t: &str) -> &str {
match t {
"int8x8_t" | "uint8x8_t" => "_s8",
"int8x16_t" | "uint8x16_t" => "q_s8",
"int16x4_t" | "uint16x4_t" => "_s16",
"int16x8_t" | "uint16x8_t" => "q_s16",
"int8x8_t" | "uint8x8_t" | "poly8x8_t" => "_s8",
"int8x16_t" | "uint8x16_t" | "poly8x16_t" => "q_s8",
"int16x4_t" | "uint16x4_t" | "poly16x4_t" => "_s16",
"int16x8_t" | "uint16x8_t" | "poly16x8_t" => "q_s16",
"int32x2_t" | "uint32x2_t" => "_s32",
"int32x4_t" | "uint32x4_t" => "q_s32",
"int64x1_t" | "uint64x1_t" => "_s64",
"int64x2_t" | "uint64x2_t" => "q_s64",
"int64x1_t" | "uint64x1_t" | "poly64x1_t" => "_s64",
"int64x2_t" | "uint64x2_t" | "poly64x2_t" => "q_s64",
/*
"float16x4_t" => "_f16",
"float16x8_t" => "q_f16",
@ -328,6 +328,16 @@ fn type_to_half(t: &str) -> &str {
}
}
/// Returns the ascending shuffle-index literal `[0, 1, …, x-1]` for a
/// vector of `x` lanes, used when emitting `simd_shuffle` index arrays
/// (e.g. for the `vmovn_high`/`vsubhn_high` spec entries).
///
/// # Panics
/// Panics if `x` is not one of the supported lane counts (2, 4, 8, 16).
fn asc(x: usize) -> &'static str {
    match x {
        2 => "[0, 1]",
        4 => "[0, 1, 2, 3]",
        8 => "[0, 1, 2, 3, 4, 5, 6, 7]",
        16 => "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]",
        // Fix: the message previously said "unknown transpose order",
        // copy-pasted from `transpose1`; this helper emits an ascending
        // index list, not a transpose order.
        _ => panic!("unknown ascending order of len {}", x),
    }
}
fn transpose1(x: usize) -> &'static str {
match x {
2 => "[0, 2]",
@ -481,6 +491,23 @@ fn bits_minus_one(t: &str) -> &'static str {
}
}
/// Returns half the element bit-width of vector type `t`, as a decimal
/// string (maps the `HFBITS` placeholder in spec files, e.g. for the
/// `vsubhn` right-shift amount).
///
/// The first three bytes of the type name (`"u8x"`, `"i16"`, `"p64"`, …)
/// uniquely identify the element width; signedness/polynomial prefixes of
/// the same width all map to the same value.
///
/// # Panics
/// Panics if the prefix is not a recognized 8/16/32/64-bit int, uint, or
/// poly element type.
fn half_bits(t: &str) -> &'static str {
    match &t[..3] {
        "i8x" | "u8x" | "p8x" => "4",
        "i16" | "u16" | "p16" => "8",
        "i32" | "u32" => "16",
        "i64" | "u64" | "p64" => "32",
        _ => panic!("Unknown bits for type {}", t),
    }
}
fn map_val<'v>(t: &str, v: &'v str) -> &'v str {
match v {
"FALSE" => false_val(t),
@ -490,6 +517,7 @@ fn map_val<'v>(t: &str, v: &'v str) -> &'v str {
"FF" => ff_val(t),
"BITS" => bits(t),
"BITS_M1" => bits_minus_one(t),
"HFBITS" => half_bits(t),
o => o,
}
}
@ -554,14 +582,21 @@ fn gen_aarch64(
let ext_c = if let Some(link_aarch64) = link_aarch64.clone() {
let ext = type_to_ext(in_t[0]);
let ext2 = type_to_ext(out_t);
let link_aarch64 = if link_aarch64.starts_with("llvm") {
link_aarch64.replace("_EXT_", ext).replace("_EXT2_", ext2)
} else {
let mut link = String::from("llvm.aarch64.neon.");
link.push_str(&link_aarch64);
link.replace("_EXT_", ext).replace("_EXT2_", ext2)
};
format!(
r#"#[allow(improper_ctypes)]
extern "C" {{
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.{}")]
#[cfg_attr(target_arch = "aarch64", link_name = "{}")]
fn {}({}) -> {};
}}
"#,
link_aarch64.replace("_EXT_", ext).replace("_EXT2_", ext2),
link_aarch64,
current_fn,
match para_num {
1 => {
@ -817,16 +852,30 @@ fn gen_arm(
if let (Some(link_arm), Some(link_aarch64)) = (link_arm.clone(), link_aarch64.clone()) {
let ext = type_to_ext(in_t[0]);
let ext2 = type_to_ext(out_t);
let link_arm = if link_arm.starts_with("llvm") {
link_arm.replace("_EXT_", ext).replace("_EXT2_", ext2)
} else {
let mut link = String::from("llvm.arm.neon.");
link.push_str(&link_arm);
link.replace("_EXT_", ext).replace("_EXT2_", ext2)
};
let link_aarch64 = if link_aarch64.starts_with("llvm") {
link_aarch64.replace("_EXT_", ext).replace("_EXT2_", ext2)
} else {
let mut link = String::from("llvm.aarch64.neon.");
link.push_str(&link_aarch64);
link.replace("_EXT_", ext).replace("_EXT2_", ext2)
};
format!(
r#"#[allow(improper_ctypes)]
extern "C" {{
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.{}")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.{}")]
#[cfg_attr(target_arch = "arm", link_name = "{}")]
#[cfg_attr(target_arch = "aarch64", link_name = "{}")]
fn {}({}) -> {};
}}
"#,
link_arm.replace("_EXT_", ext).replace("_EXT2_", ext2),
link_aarch64.replace("_EXT_", ext).replace("_EXT2_", ext2),
link_arm,
link_aarch64,
current_fn,
match para_num {
1 => {
@ -1066,6 +1115,10 @@ fn get_call(
re = Some((re_params[0].clone(), in_t[1].to_string()));
} else if re_params[1] == "in_t" {
re = Some((re_params[0].clone(), in_t[1].to_string()));
} else if re_params[1] == "in_t0" {
re = Some((re_params[0].clone(), in_t[0].to_string()));
} else if re_params[1] == "in_t1" {
re = Some((re_params[0].clone(), in_t[1].to_string()));
} else if re_params[1] == "out_t" {
re = Some((re_params[0].clone(), out_t.to_string()));
} else if re_params[1] == "half" {
@ -1097,6 +1150,9 @@ fn get_call(
});
return format!(r#"[{}]"#, &half[..half.len() - 2]);
}
if fn_name == "asc-out_len" {
return asc(type_len(out_t)).to_string();
}
if fn_name == "transpose-1-in_len" {
return transpose1(type_len(in_t[1])).to_string();
}