Add vmul_n, vmul_lane, vmulx neon instructions (#1147)

This commit is contained in:
Sparrow Li 2021-05-01 04:09:41 +08:00 committed by GitHub
parent 07f1d0cae3
commit fd29f9602c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 2263 additions and 118 deletions

View file

@ -3934,6 +3934,106 @@ pub unsafe fn vmulq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
simd_mul(a, b)
}
/// Vector multiply by scalar
///
/// Multiplies each lane of `a` by `b`, which is first broadcast with `vdup_n_f64`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmul))]
pub unsafe fn vmul_n_f64(a: float64x1_t, b: f64) -> float64x1_t {
simd_mul(a, vdup_n_f64(b))
}
/// Vector multiply by scalar
///
/// Multiplies each lane of `a` by `b`, which is first broadcast with `vdupq_n_f64`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmul))]
pub unsafe fn vmulq_n_f64(a: float64x2_t, b: f64) -> float64x2_t {
simd_mul(a, vdupq_n_f64(b))
}
/// Floating-point multiply
///
/// Multiplies `a` by lane `LANE` of `b`; `LANE` must be 0 since `b` has a single lane.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmul_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t) -> float64x1_t {
static_assert!(LANE : i32 where LANE == 0);
// The extracted f64 is transmuted back into a one-lane vector for simd_mul.
simd_mul(a, transmute::<f64, _>(simd_extract(b, LANE as u32)))
}
/// Floating-point multiply
///
/// Multiplies `a` by lane `LANE` (0 or 1) of the two-lane vector `b`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmul_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x2_t) -> float64x1_t {
static_assert_imm1!(LANE);
simd_mul(a, transmute::<f64, _>(simd_extract(b, LANE as u32)))
}
/// Floating-point multiply
///
/// Multiplies each lane of `a` by lane `LANE` (must be 0) of `b`, broadcast to both lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x1_t) -> float64x2_t {
static_assert!(LANE : i32 where LANE == 0);
simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Floating-point multiply
///
/// Multiplies each lane of `a` by lane `LANE` (0 or 1) of `b`, broadcast to both lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t) -> float64x2_t {
static_assert_imm1!(LANE);
simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Floating-point multiply
///
/// Scalar multiply of `a` by lane `LANE` (0 or 1) extracted from `b`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmuls_lane_f32<const LANE: i32>(a: f32, b: float32x2_t) -> f32 {
static_assert_imm1!(LANE);
let b: f32 = simd_extract(b, LANE as u32);
a * b
}
/// Floating-point multiply
///
/// Scalar multiply of `a` by lane `LANE` (0..=3) extracted from `b`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmuls_laneq_f32<const LANE: i32>(a: f32, b: float32x4_t) -> f32 {
static_assert_imm2!(LANE);
let b: f32 = simd_extract(b, LANE as u32);
a * b
}
/// Floating-point multiply
///
/// Scalar multiply of `a` by lane `LANE` (must be 0) extracted from `b`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmuld_lane_f64<const LANE: i32>(a: f64, b: float64x1_t) -> f64 {
static_assert!(LANE : i32 where LANE == 0);
let b: f64 = simd_extract(b, LANE as u32);
a * b
}
/// Floating-point multiply
///
/// Scalar multiply of `a` by lane `LANE` (0 or 1) extracted from `b`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmul, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmuld_laneq_f64<const LANE: i32>(a: f64, b: float64x2_t) -> f64 {
static_assert_imm1!(LANE);
let b: f64 = simd_extract(b, LANE as u32);
a * b
}
/// Signed multiply long
#[inline]
#[target_feature(enable = "neon")]
@ -4004,6 +4104,316 @@ pub unsafe fn vmull_high_p8(a: poly8x16_t, b: poly8x16_t) -> poly16x8_t {
vmull_p8(a, b)
}
/// Multiply long
///
/// Delegates to `vmull_high_s16` with `b` broadcast across all eight lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smull2))]
pub unsafe fn vmull_high_n_s16(a: int16x8_t, b: i16) -> int32x4_t {
vmull_high_s16(a, vdupq_n_s16(b))
}
/// Multiply long
///
/// Delegates to `vmull_high_s32` with `b` broadcast across all four lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smull2))]
pub unsafe fn vmull_high_n_s32(a: int32x4_t, b: i32) -> int64x2_t {
vmull_high_s32(a, vdupq_n_s32(b))
}
/// Multiply long
///
/// Delegates to `vmull_high_u16` with `b` broadcast across all eight lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umull2))]
pub unsafe fn vmull_high_n_u16(a: uint16x8_t, b: u16) -> uint32x4_t {
vmull_high_u16(a, vdupq_n_u16(b))
}
/// Multiply long
///
/// Delegates to `vmull_high_u32` with `b` broadcast across all four lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umull2))]
pub unsafe fn vmull_high_n_u32(a: uint32x4_t, b: u32) -> uint64x2_t {
vmull_high_u32(a, vdupq_n_u32(b))
}
/// Multiply long
///
/// Delegates to `vmull_high_s16` with lane `LANE` (0..=3) of `b` broadcast to all lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smull2, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_high_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int32x4_t {
static_assert_imm2!(LANE);
vmull_high_s16(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply long
///
/// Delegates to `vmull_high_s16` with lane `LANE` (0..=7) of `b` broadcast to all lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smull2, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_high_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int32x4_t {
static_assert_imm3!(LANE);
vmull_high_s16(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply long
///
/// Delegates to `vmull_high_s32` with lane `LANE` (0 or 1) of `b` broadcast to all lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smull2, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_high_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int64x2_t {
static_assert_imm1!(LANE);
vmull_high_s32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply long
///
/// Delegates to `vmull_high_s32` with lane `LANE` (0..=3) of `b` broadcast to all lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(smull2, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_high_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int64x2_t {
static_assert_imm2!(LANE);
vmull_high_s32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply long
///
/// Delegates to `vmull_high_u16` with lane `LANE` (0..=3) of `b` broadcast to all lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umull2, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_high_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> uint32x4_t {
static_assert_imm2!(LANE);
vmull_high_u16(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply long
///
/// Delegates to `vmull_high_u16` with lane `LANE` (0..=7) of `b` broadcast to all lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umull2, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_high_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
static_assert_imm3!(LANE);
vmull_high_u16(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply long
///
/// Delegates to `vmull_high_u32` with lane `LANE` (0 or 1) of `b` broadcast to all lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umull2, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_high_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> uint64x2_t {
static_assert_imm1!(LANE);
vmull_high_u32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply long
///
/// Delegates to `vmull_high_u32` with lane `LANE` (0..=3) of `b` broadcast to all lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(umull2, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_high_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
static_assert_imm2!(LANE);
vmull_high_u32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Floating-point multiply extended
///
/// Thin wrapper over the `llvm.aarch64.neon.fmulx.v2f32` intrinsic.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx))]
pub unsafe fn vmulx_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.v2f32")]
fn vmulx_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
}
vmulx_f32_(a, b)
}
/// Floating-point multiply extended
///
/// Thin wrapper over the `llvm.aarch64.neon.fmulx.v4f32` intrinsic.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx))]
pub unsafe fn vmulxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.v4f32")]
fn vmulxq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
}
vmulxq_f32_(a, b)
}
/// Floating-point multiply extended
///
/// Thin wrapper over the `llvm.aarch64.neon.fmulx.v1f64` intrinsic.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx))]
pub unsafe fn vmulx_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.v1f64")]
fn vmulx_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
}
vmulx_f64_(a, b)
}
/// Floating-point multiply extended
///
/// Thin wrapper over the `llvm.aarch64.neon.fmulx.v2f64` intrinsic.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx))]
pub unsafe fn vmulxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.v2f64")]
fn vmulxq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
}
vmulxq_f64_(a, b)
}
/// Floating-point multiply extended
///
/// Delegates to `vmulx_f64` using lane `LANE` (must be 0) of `b`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulx_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t) -> float64x1_t {
static_assert!(LANE : i32 where LANE == 0);
vmulx_f64(a, transmute::<f64, _>(simd_extract(b, LANE as u32)))
}
/// Floating-point multiply extended
///
/// Delegates to `vmulx_f64` using lane `LANE` (0 or 1) of `b`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulx_laneq_f64<const LANE: i32>(a: float64x1_t, b: float64x2_t) -> float64x1_t {
static_assert_imm1!(LANE);
vmulx_f64(a, transmute::<f64, _>(simd_extract(b, LANE as u32)))
}
/// Floating-point multiply extended
///
/// Delegates to `vmulx_f32` with lane `LANE` (0 or 1) of `b` broadcast to both lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulx_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t) -> float32x2_t {
static_assert_imm1!(LANE);
vmulx_f32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Floating-point multiply extended
///
/// Delegates to `vmulx_f32` with lane `LANE` (0..=3) of `b` broadcast to both lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulx_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x4_t) -> float32x2_t {
static_assert_imm2!(LANE);
vmulx_f32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Floating-point multiply extended
///
/// Delegates to `vmulxq_f32` with lane `LANE` (0 or 1) of `b` broadcast to all four lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulxq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x2_t) -> float32x4_t {
static_assert_imm1!(LANE);
vmulxq_f32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Floating-point multiply extended
///
/// Delegates to `vmulxq_f32` with lane `LANE` (0..=3) of `b` broadcast to all four lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulxq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t) -> float32x4_t {
static_assert_imm2!(LANE);
vmulxq_f32(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Floating-point multiply extended
///
/// Delegates to `vmulxq_f64` with lane `LANE` (must be 0) of `b` broadcast to both lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulxq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x1_t) -> float64x2_t {
static_assert!(LANE : i32 where LANE == 0);
vmulxq_f64(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Floating-point multiply extended
///
/// Delegates to `vmulxq_f64` with lane `LANE` (0 or 1) of `b` broadcast to both lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulxq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t) -> float64x2_t {
static_assert_imm1!(LANE);
vmulxq_f64(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Floating-point multiply extended
///
/// Scalar form; thin wrapper over the `llvm.aarch64.neon.fmulx.f32` intrinsic.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx))]
pub unsafe fn vmulxs_f32(a: f32, b: f32) -> f32 {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.f32")]
fn vmulxs_f32_(a: f32, b: f32) -> f32;
}
vmulxs_f32_(a, b)
}
/// Floating-point multiply extended
///
/// Scalar form; thin wrapper over the `llvm.aarch64.neon.fmulx.f64` intrinsic.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx))]
pub unsafe fn vmulxd_f64(a: f64, b: f64) -> f64 {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmulx.f64")]
fn vmulxd_f64_(a: f64, b: f64) -> f64;
}
vmulxd_f64_(a, b)
}
/// Floating-point multiply extended
///
/// Delegates to `vmulxs_f32` using lane `LANE` (0 or 1) extracted from `b`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulxs_lane_f32<const LANE: i32>(a: f32, b: float32x2_t) -> f32 {
static_assert_imm1!(LANE);
vmulxs_f32(a, simd_extract(b, LANE as u32))
}
/// Floating-point multiply extended
///
/// Delegates to `vmulxs_f32` using lane `LANE` (0..=3) extracted from `b`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulxs_laneq_f32<const LANE: i32>(a: f32, b: float32x4_t) -> f32 {
static_assert_imm2!(LANE);
vmulxs_f32(a, simd_extract(b, LANE as u32))
}
/// Floating-point multiply extended
///
/// Delegates to `vmulxd_f64` using lane `LANE` (must be 0) extracted from `b`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulxd_lane_f64<const LANE: i32>(a: f64, b: float64x1_t) -> f64 {
static_assert!(LANE : i32 where LANE == 0);
vmulxd_f64(a, simd_extract(b, LANE as u32))
}
/// Floating-point multiply extended
///
/// Delegates to `vmulxd_f64` using lane `LANE` (0 or 1) extracted from `b`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(fmulx, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulxd_laneq_f64<const LANE: i32>(a: f64, b: float64x2_t) -> f64 {
static_assert_imm1!(LANE);
vmulxd_f64(a, simd_extract(b, LANE as u32))
}
/// Floating-point fused Multiply-Add to accumulator(vector)
#[inline]
#[target_feature(enable = "neon")]
@ -10814,6 +11224,96 @@ mod test {
assert_eq!(r, e);
}
// Generated tests for the vmul_n / vmul_lane family: each builds lane inputs via
// `transmute` from the portable simd helper types (f64x2, f32x2, ...) and checks
// the exact product against a precomputed expected vector.
#[simd_test(enable = "neon")]
unsafe fn test_vmul_n_f64() {
let a: f64 = 1.;
let b: f64 = 2.;
let e: f64 = 2.;
let r: f64 = transmute(vmul_n_f64(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_n_f64() {
let a: f64x2 = f64x2::new(1., 2.);
let b: f64 = 2.;
let e: f64x2 = f64x2::new(2., 4.);
let r: f64x2 = transmute(vmulq_n_f64(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_lane_f64() {
let a: f64 = 1.;
let b: f64 = 2.;
let e: f64 = 2.;
let r: f64 = transmute(vmul_lane_f64::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_laneq_f64() {
let a: f64 = 1.;
let b: f64x2 = f64x2::new(2., 0.);
let e: f64 = 2.;
let r: f64 = transmute(vmul_laneq_f64::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_lane_f64() {
let a: f64x2 = f64x2::new(1., 2.);
let b: f64 = 2.;
let e: f64x2 = f64x2::new(2., 4.);
let r: f64x2 = transmute(vmulq_lane_f64::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_laneq_f64() {
let a: f64x2 = f64x2::new(1., 2.);
let b: f64x2 = f64x2::new(2., 0.);
let e: f64x2 = f64x2::new(2., 4.);
let r: f64x2 = transmute(vmulq_laneq_f64::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmuls_lane_f32() {
let a: f32 = 1.;
let b: f32x2 = f32x2::new(2., 0.);
let e: f32 = 2.;
let r: f32 = transmute(vmuls_lane_f32::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmuls_laneq_f32() {
let a: f32 = 1.;
let b: f32x4 = f32x4::new(2., 0., 0., 0.);
let e: f32 = 2.;
let r: f32 = transmute(vmuls_laneq_f32::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmuld_lane_f64() {
let a: f64 = 1.;
let b: f64 = 2.;
let e: f64 = 2.;
let r: f64 = transmute(vmuld_lane_f64::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmuld_laneq_f64() {
let a: f64 = 1.;
let b: f64x2 = f64x2::new(2., 0.);
let e: f64 = 2.;
let r: f64 = transmute(vmuld_laneq_f64::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_s8() {
let a: i8x16 = i8x16::new(1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16);
@ -10877,6 +11377,276 @@ mod test {
assert_eq!(r, e);
}
// Generated tests for the vmull_high_* and vmulx* families. The vmull_high tests
// place the scalar/lane multiplier so that only the high half of `a` contributes
// (expected values 18, 20, ... come from the high lanes 9, 10, 11, 12 times 2).
// The vmulx tests use finite operands, where fmulx behaves like an ordinary
// floating-point multiply.
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_n_s16() {
let a: i16x8 = i16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
let b: i16 = 2;
let e: i32x4 = i32x4::new(18, 20, 22, 24);
let r: i32x4 = transmute(vmull_high_n_s16(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_n_s32() {
let a: i32x4 = i32x4::new(1, 2, 9, 10);
let b: i32 = 2;
let e: i64x2 = i64x2::new(18, 20);
let r: i64x2 = transmute(vmull_high_n_s32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_n_u16() {
let a: u16x8 = u16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
let b: u16 = 2;
let e: u32x4 = u32x4::new(18, 20, 22, 24);
let r: u32x4 = transmute(vmull_high_n_u16(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_n_u32() {
let a: u32x4 = u32x4::new(1, 2, 9, 10);
let b: u32 = 2;
let e: u64x2 = u64x2::new(18, 20);
let r: u64x2 = transmute(vmull_high_n_u32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_lane_s16() {
let a: i16x8 = i16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
let b: i16x4 = i16x4::new(0, 2, 0, 0);
let e: i32x4 = i32x4::new(18, 20, 22, 24);
let r: i32x4 = transmute(vmull_high_lane_s16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_laneq_s16() {
let a: i16x8 = i16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
let e: i32x4 = i32x4::new(18, 20, 22, 24);
let r: i32x4 = transmute(vmull_high_laneq_s16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_lane_s32() {
let a: i32x4 = i32x4::new(1, 2, 9, 10);
let b: i32x2 = i32x2::new(0, 2);
let e: i64x2 = i64x2::new(18, 20);
let r: i64x2 = transmute(vmull_high_lane_s32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_laneq_s32() {
let a: i32x4 = i32x4::new(1, 2, 9, 10);
let b: i32x4 = i32x4::new(0, 2, 0, 0);
let e: i64x2 = i64x2::new(18, 20);
let r: i64x2 = transmute(vmull_high_laneq_s32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_lane_u16() {
let a: u16x8 = u16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
let b: u16x4 = u16x4::new(0, 2, 0, 0);
let e: u32x4 = u32x4::new(18, 20, 22, 24);
let r: u32x4 = transmute(vmull_high_lane_u16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_laneq_u16() {
let a: u16x8 = u16x8::new(1, 2, 9, 10, 9, 10, 11, 12);
let b: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
let e: u32x4 = u32x4::new(18, 20, 22, 24);
let r: u32x4 = transmute(vmull_high_laneq_u16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_lane_u32() {
let a: u32x4 = u32x4::new(1, 2, 9, 10);
let b: u32x2 = u32x2::new(0, 2);
let e: u64x2 = u64x2::new(18, 20);
let r: u64x2 = transmute(vmull_high_lane_u32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_high_laneq_u32() {
let a: u32x4 = u32x4::new(1, 2, 9, 10);
let b: u32x4 = u32x4::new(0, 2, 0, 0);
let e: u64x2 = u64x2::new(18, 20);
let r: u64x2 = transmute(vmull_high_laneq_u32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulx_f32() {
let a: f32x2 = f32x2::new(1., 2.);
let b: f32x2 = f32x2::new(2., 2.);
let e: f32x2 = f32x2::new(2., 4.);
let r: f32x2 = transmute(vmulx_f32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulxq_f32() {
let a: f32x4 = f32x4::new(1., 2., 3., 4.);
let b: f32x4 = f32x4::new(2., 2., 2., 2.);
let e: f32x4 = f32x4::new(2., 4., 6., 8.);
let r: f32x4 = transmute(vmulxq_f32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulx_f64() {
let a: f64 = 1.;
let b: f64 = 2.;
let e: f64 = 2.;
let r: f64 = transmute(vmulx_f64(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulxq_f64() {
let a: f64x2 = f64x2::new(1., 2.);
let b: f64x2 = f64x2::new(2., 2.);
let e: f64x2 = f64x2::new(2., 4.);
let r: f64x2 = transmute(vmulxq_f64(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulx_lane_f64() {
let a: f64 = 1.;
let b: f64 = 2.;
let e: f64 = 2.;
let r: f64 = transmute(vmulx_lane_f64::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulx_laneq_f64() {
let a: f64 = 1.;
let b: f64x2 = f64x2::new(2., 0.);
let e: f64 = 2.;
let r: f64 = transmute(vmulx_laneq_f64::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulx_lane_f32() {
let a: f32x2 = f32x2::new(1., 2.);
let b: f32x2 = f32x2::new(2., 0.);
let e: f32x2 = f32x2::new(2., 4.);
let r: f32x2 = transmute(vmulx_lane_f32::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulx_laneq_f32() {
let a: f32x2 = f32x2::new(1., 2.);
let b: f32x4 = f32x4::new(2., 0., 0., 0.);
let e: f32x2 = f32x2::new(2., 4.);
let r: f32x2 = transmute(vmulx_laneq_f32::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulxq_lane_f32() {
let a: f32x4 = f32x4::new(1., 2., 3., 4.);
let b: f32x2 = f32x2::new(2., 0.);
let e: f32x4 = f32x4::new(2., 4., 6., 8.);
let r: f32x4 = transmute(vmulxq_lane_f32::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulxq_laneq_f32() {
let a: f32x4 = f32x4::new(1., 2., 3., 4.);
let b: f32x4 = f32x4::new(2., 0., 0., 0.);
let e: f32x4 = f32x4::new(2., 4., 6., 8.);
let r: f32x4 = transmute(vmulxq_laneq_f32::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulxq_lane_f64() {
let a: f64x2 = f64x2::new(1., 2.);
let b: f64 = 2.;
let e: f64x2 = f64x2::new(2., 4.);
let r: f64x2 = transmute(vmulxq_lane_f64::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulxq_laneq_f64() {
let a: f64x2 = f64x2::new(1., 2.);
let b: f64x2 = f64x2::new(2., 0.);
let e: f64x2 = f64x2::new(2., 4.);
let r: f64x2 = transmute(vmulxq_laneq_f64::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulxs_f32() {
let a: f32 = 2.;
let b: f32 = 3.;
let e: f32 = 6.;
let r: f32 = transmute(vmulxs_f32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulxd_f64() {
let a: f64 = 2.;
let b: f64 = 3.;
let e: f64 = 6.;
let r: f64 = transmute(vmulxd_f64(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulxs_lane_f32() {
let a: f32 = 2.;
let b: f32x2 = f32x2::new(3., 0.);
let e: f32 = 6.;
let r: f32 = transmute(vmulxs_lane_f32::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulxs_laneq_f32() {
let a: f32 = 2.;
let b: f32x4 = f32x4::new(3., 0., 0., 0.);
let e: f32 = 6.;
let r: f32 = transmute(vmulxs_laneq_f32::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulxd_lane_f64() {
let a: f64 = 2.;
let b: f64 = 3.;
let e: f64 = 6.;
let r: f64 = transmute(vmulxd_lane_f64::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulxd_laneq_f64() {
let a: f64 = 2.;
let b: f64x2 = f64x2::new(3., 0.);
let e: f64 = 6.;
let r: f64 = transmute(vmulxd_laneq_f64::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vfma_f64() {
let a: f64 = 2.0;

View file

@ -108,9 +108,6 @@ extern "C" {
#[link_name = "llvm.aarch64.neon.usqadd.v2i64"]
fn vsqaddq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t;
#[link_name = "llvm.aarch64.neon.pmull64"]
fn vmull_p64_(a: i64, b: i64) -> int8x16_t;
#[link_name = "llvm.aarch64.neon.addp.v8i16"]
fn vpaddq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
#[link_name = "llvm.aarch64.neon.addp.v4i32"]
@ -1150,14 +1147,6 @@ pub unsafe fn vaddlvq_u8(a: uint8x16_t) -> u16 {
vaddlvq_u8_(a) as u16
}
/// Polynomial multiply long
///
/// Wraps the `llvm.aarch64.neon.pmull64` intrinsic (declared elsewhere in this
/// file as `vmull_p64_`); the i64/int8x16_t result is transmuted to `p128`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(pmull))]
pub unsafe fn vmull_p64(a: p64, b: p64) -> p128 {
transmute(vmull_p64_(transmute(a), transmute(b)))
}
/// Vector add.
#[inline]
#[target_feature(enable = "neon")]
@ -3260,36 +3249,6 @@ mod tests {
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_p64() {
// FIXME: I've a hard time writing a test for this as the documentation
// from arm is a bit thin as to waht exactly it does
let a: i64 = 8;
let b: i64 = 7;
let e: i128 = 56;
let r: i128 = transmute(vmull_p64(transmute(a), transmute(b)));
assert_eq!(r, e);
/*
let a: i64 = 5;
let b: i64 = 5;
let e: i128 = 25;
let r: i128 = transmute(vmull_p64(a, b));
assert_eq!(r, e);
let a: i64 = 6;
let b: i64 = 6;
let e: i128 = 36;
let r: i128 = transmute(vmull_p64(a, b));
assert_eq!(r, e);
let a: i64 = 7;
let b: i64 = 6;
let e: i128 = 42;
let r: i128 = transmute(vmull_p64(a, b));
assert_eq!(r, e);
*/
}
#[simd_test(enable = "neon")]
unsafe fn test_vadd_f64() {
let a = 1.;

View file

@ -5558,6 +5558,38 @@ pub unsafe fn vmulq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
simd_mul(a, b)
}
/// Polynomial multiply
///
/// Thin wrapper over the per-target polynomial-multiply intrinsic
/// (`llvm.arm.neon.vmulp.v8i8` on ARM, `llvm.aarch64.neon.pmul.v8i8` on AArch64).
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmul))]
pub unsafe fn vmul_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulp.v8i8")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmul.v8i8")]
fn vmul_p8_(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t;
}
vmul_p8_(a, b)
}
/// Polynomial multiply
///
/// 128-bit variant of [`vmul_p8`]; wraps the `v16i8` form of the same intrinsic.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmul))]
pub unsafe fn vmulq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
#[allow(improper_ctypes)]
extern "C" {
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulp.v16i8")]
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmul.v16i8")]
fn vmulq_p8_(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t;
}
vmulq_p8_(a, b)
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
@ -5578,6 +5610,346 @@ pub unsafe fn vmulq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
simd_mul(a, b)
}
/// Vector multiply by scalar
///
/// Multiplies each lane of `a` by `b`, which is first broadcast with `vdup_n_s16`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
pub unsafe fn vmul_n_s16(a: int16x4_t, b: i16) -> int16x4_t {
simd_mul(a, vdup_n_s16(b))
}
/// Vector multiply by scalar
///
/// Multiplies each lane of `a` by `b`, which is first broadcast with `vdupq_n_s16`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
pub unsafe fn vmulq_n_s16(a: int16x8_t, b: i16) -> int16x8_t {
simd_mul(a, vdupq_n_s16(b))
}
/// Vector multiply by scalar
///
/// Multiplies each lane of `a` by `b`, which is first broadcast with `vdup_n_s32`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
pub unsafe fn vmul_n_s32(a: int32x2_t, b: i32) -> int32x2_t {
simd_mul(a, vdup_n_s32(b))
}
/// Vector multiply by scalar
///
/// Multiplies each lane of `a` by `b`, which is first broadcast with `vdupq_n_s32`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
pub unsafe fn vmulq_n_s32(a: int32x4_t, b: i32) -> int32x4_t {
simd_mul(a, vdupq_n_s32(b))
}
/// Vector multiply by scalar
///
/// Multiplies each lane of `a` by `b`, which is first broadcast with `vdup_n_u16`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
pub unsafe fn vmul_n_u16(a: uint16x4_t, b: u16) -> uint16x4_t {
simd_mul(a, vdup_n_u16(b))
}
/// Vector multiply by scalar
///
/// Multiplies each lane of `a` by `b`, which is first broadcast with `vdupq_n_u16`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
pub unsafe fn vmulq_n_u16(a: uint16x8_t, b: u16) -> uint16x8_t {
simd_mul(a, vdupq_n_u16(b))
}
/// Vector multiply by scalar
///
/// Multiplies each lane of `a` by `b`, which is first broadcast with `vdup_n_u32`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
pub unsafe fn vmul_n_u32(a: uint32x2_t, b: u32) -> uint32x2_t {
simd_mul(a, vdup_n_u32(b))
}
/// Vector multiply by scalar
///
/// Multiplies each lane of `a` by `b`, which is first broadcast with `vdupq_n_u32`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
pub unsafe fn vmulq_n_u32(a: uint32x4_t, b: u32) -> uint32x4_t {
simd_mul(a, vdupq_n_u32(b))
}
/// Vector multiply by scalar
///
/// Multiplies each lane of `a` by `b`, which is first broadcast with `vdup_n_f32`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
pub unsafe fn vmul_n_f32(a: float32x2_t, b: f32) -> float32x2_t {
simd_mul(a, vdup_n_f32(b))
}
/// Vector multiply by scalar
///
/// Multiplies each lane of `a` by `b`, which is first broadcast with `vdupq_n_f32`.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
pub unsafe fn vmulq_n_f32(a: float32x4_t, b: f32) -> float32x4_t {
simd_mul(a, vdupq_n_f32(b))
}
/// Multiply
///
/// Multiplies each lane of `a` by lane `LANE` (0..=3) of `b`, broadcast to all lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmul_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
static_assert_imm2!(LANE);
simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply
///
/// Multiplies each lane of `a` by lane `LANE` (0..=7) of the 8-lane `b`, broadcast to all lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmul_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int16x4_t {
static_assert_imm3!(LANE);
simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply
///
/// Multiplies each lane of `a` by lane `LANE` (0..=3) of the 4-lane `b`, broadcast to all eight lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int16x8_t {
static_assert_imm2!(LANE);
simd_mul(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply
///
/// Multiplies each lane of `a` by lane `LANE` (0..=7) of `b`, broadcast to all eight lanes.
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
static_assert_imm3!(LANE);
simd_mul(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmul_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
static_assert_imm1!(LANE);
simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmul_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int32x2_t {
static_assert_imm2!(LANE);
simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int32x4_t {
static_assert_imm1!(LANE);
simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
static_assert_imm2!(LANE);
simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
// --- vmul_lane / vmul_laneq, unsigned: identical shape to the signed family
// above — splat lane `LANE` of `b` via simd_shuffle, then lane-wise
// simd_mul; the static_assert_immN! keeps the const lane index in range for
// the width of `b`. Generated by stdarch-gen.
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmul_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
static_assert_imm2!(LANE);
simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmul_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x8_t) -> uint16x4_t {
static_assert_imm3!(LANE);
simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulq_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> uint16x8_t {
static_assert_imm2!(LANE);
simd_mul(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulq_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
static_assert_imm3!(LANE);
simd_mul(a, simd_shuffle8(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmul_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
static_assert_imm1!(LANE);
simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmul_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x4_t) -> uint32x2_t {
static_assert_imm2!(LANE);
simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> uint32x4_t {
static_assert_imm1!(LANE);
simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
static_assert_imm2!(LANE);
simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
// --- vmul_lane / vmul_laneq, f32: same splat-then-multiply shape as the
// integer families; the AArch64 assert_instr expects `fmul` and the test
// harness instantiates LANE = 0. Generated by stdarch-gen.
/// Floating-point multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmul_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t) -> float32x2_t {
static_assert_imm1!(LANE);
simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Floating-point multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmul_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x4_t) -> float32x2_t {
static_assert_imm2!(LANE);
simd_mul(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Floating-point multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x2_t) -> float32x4_t {
static_assert_imm1!(LANE);
simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Floating-point multiply
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmul, LANE = 0))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmulq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t) -> float32x4_t {
static_assert_imm2!(LANE);
simd_mul(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Signed multiply long
#[inline]
#[target_feature(enable = "neon")]
@ -5690,6 +6062,142 @@ pub unsafe fn vmull_p8(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t {
vmull_p8_(a, b)
}
// --- vmull_n family: widening (long) multiply of a vector by a broadcast
// scalar — the scalar is splatted with vdup and fed to the existing vmull
// intrinsic, which widens 16->32 / 32->64 bits per lane. ---
// NOTE(review): the `h`/`s` infixes in `vmullh_n_s16`, `vmulls_n_s32`,
// `vmullh_n_u16`, `vmulls_n_u32` do not match the ACLE spellings of these
// intrinsics (`vmull_n_s16`, `vmull_n_s32`, `vmull_n_u16`, `vmull_n_u32`);
// those infixes are used for scalar-operand intrinsics elsewhere in this
// crate. The sibling stdarch-gen spec entry (`name = vmull`, `n-suffix`)
// also suggests the plain `vmull_n_*` names. Renaming would break the
// tests referencing these symbols, so this is flagged rather than fixed
// here — confirm against the generator output before release.
/// Vector long multiply with scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
pub unsafe fn vmullh_n_s16(a: int16x4_t, b: i16) -> int32x4_t {
vmull_s16(a, vdup_n_s16(b))
}
/// Vector long multiply with scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
pub unsafe fn vmulls_n_s32(a: int32x2_t, b: i32) -> int64x2_t {
vmull_s32(a, vdup_n_s32(b))
}
/// Vector long multiply with scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))]
// Unsigned variants check for `umull` on AArch64 instead of `smull`.
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
pub unsafe fn vmullh_n_u16(a: uint16x4_t, b: u16) -> uint32x4_t {
vmull_u16(a, vdup_n_u16(b))
}
/// Vector long multiply with scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
pub unsafe fn vmulls_n_u32(a: uint32x2_t, b: u32) -> uint64x2_t {
vmull_u32(a, vdup_n_u32(b))
}
// --- vmull_lane / vmull_laneq: widening multiply by a single lane of `b` —
// splat lane `LANE` of `b` with simd_shuffle (shuffle width matches the
// *input* vector, since vmull widens afterwards), then delegate to the
// non-lane vmull intrinsic. static_assert_immN! bounds LANE to `b`'s width.
// Generated by stdarch-gen.
/// Vector long multiply by scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t) -> int32x4_t {
static_assert_imm2!(LANE);
vmull_s16(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Vector long multiply by scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x8_t) -> int32x4_t {
static_assert_imm3!(LANE);
vmull_s16(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Vector long multiply by scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t) -> int64x2_t {
static_assert_imm1!(LANE);
vmull_s32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Vector long multiply by scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x4_t) -> int64x2_t {
static_assert_imm2!(LANE);
vmull_s32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Vector long multiply by scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t {
static_assert_imm2!(LANE);
vmull_u16(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Vector long multiply by scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x8_t) -> uint32x4_t {
static_assert_imm3!(LANE);
vmull_u16(a, simd_shuffle4(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
}
/// Vector long multiply by scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
static_assert_imm1!(LANE);
vmull_u32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Vector long multiply by scalar
#[inline]
#[target_feature(enable = "neon")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull, LANE = 1))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull, LANE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn vmull_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x4_t) -> uint64x2_t {
static_assert_imm2!(LANE);
vmull_u32(a, simd_shuffle2(b, b, [LANE as u32, LANE as u32]))
}
/// Floating-point fused Multiply-Add to accumulator(vector)
#[inline]
#[target_feature(enable = "neon")]
@ -17013,6 +17521,24 @@ mod test {
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_p8() {
let a: i8x8 = i8x8::new(1, 3, 1, 3, 1, 3, 1, 3);
let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let e: i8x8 = i8x8::new(1, 6, 3, 12, 5, 10, 7, 24);
let r: i8x8 = transmute(vmul_p8(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_p8() {
let a: i8x16 = i8x16::new(1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3);
let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let e: i8x16 = i8x16::new(1, 6, 3, 12, 5, 10, 7, 24, 9, 30, 11, 20, 13, 18, 15, 48);
let r: i8x16 = transmute(vmulq_p8(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_f32() {
let a: f32x2 = f32x2::new(1.0, 2.0);
@ -17031,6 +17557,276 @@ mod test {
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_n_s16() {
let a: i16x4 = i16x4::new(1, 2, 3, 4);
let b: i16 = 2;
let e: i16x4 = i16x4::new(2, 4, 6, 8);
let r: i16x4 = transmute(vmul_n_s16(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_n_s16() {
let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let b: i16 = 2;
let e: i16x8 = i16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
let r: i16x8 = transmute(vmulq_n_s16(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_n_s32() {
let a: i32x2 = i32x2::new(1, 2);
let b: i32 = 2;
let e: i32x2 = i32x2::new(2, 4);
let r: i32x2 = transmute(vmul_n_s32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_n_s32() {
let a: i32x4 = i32x4::new(1, 2, 3, 4);
let b: i32 = 2;
let e: i32x4 = i32x4::new(2, 4, 6, 8);
let r: i32x4 = transmute(vmulq_n_s32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_n_u16() {
let a: u16x4 = u16x4::new(1, 2, 3, 4);
let b: u16 = 2;
let e: u16x4 = u16x4::new(2, 4, 6, 8);
let r: u16x4 = transmute(vmul_n_u16(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_n_u16() {
let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let b: u16 = 2;
let e: u16x8 = u16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
let r: u16x8 = transmute(vmulq_n_u16(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_n_u32() {
let a: u32x2 = u32x2::new(1, 2);
let b: u32 = 2;
let e: u32x2 = u32x2::new(2, 4);
let r: u32x2 = transmute(vmul_n_u32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_n_u32() {
let a: u32x4 = u32x4::new(1, 2, 3, 4);
let b: u32 = 2;
let e: u32x4 = u32x4::new(2, 4, 6, 8);
let r: u32x4 = transmute(vmulq_n_u32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_n_f32() {
let a: f32x2 = f32x2::new(1., 2.);
let b: f32 = 2.;
let e: f32x2 = f32x2::new(2., 4.);
let r: f32x2 = transmute(vmul_n_f32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_n_f32() {
let a: f32x4 = f32x4::new(1., 2., 3., 4.);
let b: f32 = 2.;
let e: f32x4 = f32x4::new(2., 4., 6., 8.);
let r: f32x4 = transmute(vmulq_n_f32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_lane_s16() {
let a: i16x4 = i16x4::new(1, 2, 3, 4);
let b: i16x4 = i16x4::new(0, 2, 0, 0);
let e: i16x4 = i16x4::new(2, 4, 6, 8);
let r: i16x4 = transmute(vmul_lane_s16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_laneq_s16() {
let a: i16x4 = i16x4::new(1, 2, 3, 4);
let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
let e: i16x4 = i16x4::new(2, 4, 6, 8);
let r: i16x4 = transmute(vmul_laneq_s16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_lane_s16() {
let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let b: i16x4 = i16x4::new(0, 2, 0, 0);
let e: i16x8 = i16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
let r: i16x8 = transmute(vmulq_lane_s16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_laneq_s16() {
let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
let e: i16x8 = i16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
let r: i16x8 = transmute(vmulq_laneq_s16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_lane_s32() {
let a: i32x2 = i32x2::new(1, 2);
let b: i32x2 = i32x2::new(0, 2);
let e: i32x2 = i32x2::new(2, 4);
let r: i32x2 = transmute(vmul_lane_s32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_laneq_s32() {
let a: i32x2 = i32x2::new(1, 2);
let b: i32x4 = i32x4::new(0, 2, 0, 0);
let e: i32x2 = i32x2::new(2, 4);
let r: i32x2 = transmute(vmul_laneq_s32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_lane_s32() {
let a: i32x4 = i32x4::new(1, 2, 3, 4);
let b: i32x2 = i32x2::new(0, 2);
let e: i32x4 = i32x4::new(2, 4, 6, 8);
let r: i32x4 = transmute(vmulq_lane_s32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_laneq_s32() {
let a: i32x4 = i32x4::new(1, 2, 3, 4);
let b: i32x4 = i32x4::new(0, 2, 0, 0);
let e: i32x4 = i32x4::new(2, 4, 6, 8);
let r: i32x4 = transmute(vmulq_laneq_s32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_lane_u16() {
let a: u16x4 = u16x4::new(1, 2, 3, 4);
let b: u16x4 = u16x4::new(0, 2, 0, 0);
let e: u16x4 = u16x4::new(2, 4, 6, 8);
let r: u16x4 = transmute(vmul_lane_u16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_laneq_u16() {
let a: u16x4 = u16x4::new(1, 2, 3, 4);
let b: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
let e: u16x4 = u16x4::new(2, 4, 6, 8);
let r: u16x4 = transmute(vmul_laneq_u16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_lane_u16() {
let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let b: u16x4 = u16x4::new(0, 2, 0, 0);
let e: u16x8 = u16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
let r: u16x8 = transmute(vmulq_lane_u16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_laneq_u16() {
let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let b: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
let e: u16x8 = u16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
let r: u16x8 = transmute(vmulq_laneq_u16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_lane_u32() {
let a: u32x2 = u32x2::new(1, 2);
let b: u32x2 = u32x2::new(0, 2);
let e: u32x2 = u32x2::new(2, 4);
let r: u32x2 = transmute(vmul_lane_u32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_laneq_u32() {
let a: u32x2 = u32x2::new(1, 2);
let b: u32x4 = u32x4::new(0, 2, 0, 0);
let e: u32x2 = u32x2::new(2, 4);
let r: u32x2 = transmute(vmul_laneq_u32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_lane_u32() {
let a: u32x4 = u32x4::new(1, 2, 3, 4);
let b: u32x2 = u32x2::new(0, 2);
let e: u32x4 = u32x4::new(2, 4, 6, 8);
let r: u32x4 = transmute(vmulq_lane_u32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_laneq_u32() {
let a: u32x4 = u32x4::new(1, 2, 3, 4);
let b: u32x4 = u32x4::new(0, 2, 0, 0);
let e: u32x4 = u32x4::new(2, 4, 6, 8);
let r: u32x4 = transmute(vmulq_laneq_u32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_lane_f32() {
let a: f32x2 = f32x2::new(1., 2.);
let b: f32x2 = f32x2::new(2., 0.);
let e: f32x2 = f32x2::new(2., 4.);
let r: f32x2 = transmute(vmul_lane_f32::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmul_laneq_f32() {
let a: f32x2 = f32x2::new(1., 2.);
let b: f32x4 = f32x4::new(2., 0., 0., 0.);
let e: f32x2 = f32x2::new(2., 4.);
let r: f32x2 = transmute(vmul_laneq_f32::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_lane_f32() {
let a: f32x4 = f32x4::new(1., 2., 3., 4.);
let b: f32x2 = f32x2::new(2., 0.);
let e: f32x4 = f32x4::new(2., 4., 6., 8.);
let r: f32x4 = transmute(vmulq_lane_f32::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulq_laneq_f32() {
let a: f32x4 = f32x4::new(1., 2., 3., 4.);
let b: f32x4 = f32x4::new(2., 0., 0., 0.);
let e: f32x4 = f32x4::new(2., 4., 6., 8.);
let r: f32x4 = transmute(vmulq_laneq_f32::<0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_s8() {
let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
@ -17094,6 +17890,114 @@ mod test {
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmullh_n_s16() {
let a: i16x4 = i16x4::new(1, 2, 3, 4);
let b: i16 = 2;
let e: i32x4 = i32x4::new(2, 4, 6, 8);
let r: i32x4 = transmute(vmullh_n_s16(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulls_n_s32() {
let a: i32x2 = i32x2::new(1, 2);
let b: i32 = 2;
let e: i64x2 = i64x2::new(2, 4);
let r: i64x2 = transmute(vmulls_n_s32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmullh_n_u16() {
let a: u16x4 = u16x4::new(1, 2, 3, 4);
let b: u16 = 2;
let e: u32x4 = u32x4::new(2, 4, 6, 8);
let r: u32x4 = transmute(vmullh_n_u16(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmulls_n_u32() {
let a: u32x2 = u32x2::new(1, 2);
let b: u32 = 2;
let e: u64x2 = u64x2::new(2, 4);
let r: u64x2 = transmute(vmulls_n_u32(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_lane_s16() {
let a: i16x4 = i16x4::new(1, 2, 3, 4);
let b: i16x4 = i16x4::new(0, 2, 0, 0);
let e: i32x4 = i32x4::new(2, 4, 6, 8);
let r: i32x4 = transmute(vmull_lane_s16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_laneq_s16() {
let a: i16x4 = i16x4::new(1, 2, 3, 4);
let b: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
let e: i32x4 = i32x4::new(2, 4, 6, 8);
let r: i32x4 = transmute(vmull_laneq_s16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_lane_s32() {
let a: i32x2 = i32x2::new(1, 2);
let b: i32x2 = i32x2::new(0, 2);
let e: i64x2 = i64x2::new(2, 4);
let r: i64x2 = transmute(vmull_lane_s32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_laneq_s32() {
let a: i32x2 = i32x2::new(1, 2);
let b: i32x4 = i32x4::new(0, 2, 0, 0);
let e: i64x2 = i64x2::new(2, 4);
let r: i64x2 = transmute(vmull_laneq_s32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_lane_u16() {
let a: u16x4 = u16x4::new(1, 2, 3, 4);
let b: u16x4 = u16x4::new(0, 2, 0, 0);
let e: u32x4 = u32x4::new(2, 4, 6, 8);
let r: u32x4 = transmute(vmull_lane_u16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_laneq_u16() {
let a: u16x4 = u16x4::new(1, 2, 3, 4);
let b: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
let e: u32x4 = u32x4::new(2, 4, 6, 8);
let r: u32x4 = transmute(vmull_laneq_u16::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_lane_u32() {
let a: u32x2 = u32x2::new(1, 2);
let b: u32x2 = u32x2::new(0, 2);
let e: u64x2 = u64x2::new(2, 4);
let r: u64x2 = transmute(vmull_lane_u32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vmull_laneq_u32() {
let a: u32x2 = u32x2::new(1, 2);
let b: u32x4 = u32x4::new(0, 2, 0, 0);
let e: u64x2 = u64x2::new(2, 4);
let r: u64x2 = transmute(vmull_laneq_u32::<1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vfma_f32() {
let a: f32x2 = f32x2::new(2.0, 3.0);

View file

@ -338,7 +338,7 @@ generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t,
/// Signed compare bitwise Test bits nonzero
name = vtst
multi_fn = simd_and, c:in_t
multi_fn = simd_and, c:in_t, a, b
multi_fn = fixed, d:in_t
multi_fn = simd_ne, c, transmute(d)
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
@ -354,7 +354,7 @@ generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8
/// Unsigned compare bitwise Test bits nonzero
name = vtst
multi_fn = simd_and, c:in_t
multi_fn = simd_and, c:in_t, a, b
multi_fn = fixed, d:in_t
multi_fn = simd_ne, c, transmute(d)
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
@ -1864,6 +1864,18 @@ aarch64 = mul
fn = simd_mul
generate int*_t, uint*_t
/// Polynomial multiply
name = vmul
a = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate 1, 6, 3, 12, 5, 10, 7, 24, 9, 30, 11, 20, 13, 18, 15, 48
aarch64 = pmul
link-aarch64 = pmul._EXT_
arm = vmul
link-arm = vmulp._EXT_
generate poly8x8_t, poly8x16_t
/// Multiply
name = vmul
fn = simd_mul
@ -1877,6 +1889,108 @@ generate float64x*_t
arm = vmul.
generate float*_t
/// Vector multiply by scalar
name = vmul
out-n-suffix
multi_fn = simd_mul, a, {vdup-nout-noext, b}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 2
validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32
arm = vmul
aarch64 = mul
generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t
generate uint16x4_t:u16:uint16x4_t, uint16x8_t:u16:uint16x8_t, uint32x2_t:u32:uint32x2_t, uint32x4_t:u32:uint32x4_t
/// Vector multiply by scalar
name = vmul
out-n-suffix
multi_fn = simd_mul, a, {vdup-nout-noext, b}
a = 1., 2., 3., 4.
b = 2.
validate 2., 4., 6., 8.
aarch64 = fmul
generate float64x1_t:f64:float64x1_t, float64x2_t:f64:float64x2_t
arm = vmul
generate float32x2_t:f32:float32x2_t, float32x4_t:f32:float32x4_t
/// Multiply
name = vmul
lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_mul, a, {simd_shuffle-out_len-noext, b, b, {dup-out_len-LANE as u32}}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32
aarch64 = mul
arm = vmul
generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t
generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t
generate uint16x4_t, uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t
generate uint32x2_t, uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t
/// Floating-point multiply
name = vmul
lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_mul, a, {transmute--<element_t _>, {simd_extract, b, LANE as u32}}
a = 1., 2., 3., 4.
b = 2., 0., 0., 0.
n = 0
validate 2., 4., 6., 8.
aarch64 = fmul
generate float64x1_t, float64x1_t:float64x2_t:float64x1_t
/// Floating-point multiply
name = vmul
lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_mul, a, {simd_shuffle-out_len-noext, b, b, {dup-out_len-LANE as u32}}
a = 1., 2., 3., 4.
b = 2., 0., 0., 0.
n = 0
validate 2., 4., 6., 8.
aarch64 = fmul
generate float64x2_t:float64x1_t:float64x2_t, float64x2_t
arm = vmul
generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t
/// Floating-point multiply
name = vmuls_lane
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_extract, b:f32, b, LANE as u32
multi_fn = a * b
a = 1.
b = 2., 0., 0., 0.
n = 0
validate 2.
aarch64 = fmul
generate f32:float32x2_t:f32, f32:float32x4_t:f32
/// Floating-point multiply
name = vmuld_lane
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_extract, b:f64, b, LANE as u32
multi_fn = a * b
a = 1.
b = 2., 0.
n = 0
validate 2.
aarch64 = fmul
generate f64:float64x1_t:f64, f64:float64x2_t:f64
/// Signed multiply long
name = vmull
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
@ -1941,6 +2055,21 @@ link-arm = vmullp._EXT_
link-aarch64 = pmull._EXT_
generate poly8x8_t:poly8x8_t:poly16x8_t
/// Polynomial multiply long
name = vmull
no-q
a = 15
b = 3
validate 17
target = crypto
aarch64 = pmull
link-aarch64 = pmull64:p64:p64:p64:int8x16_t
arm = vmull
link-arm = vmullp.v2i64:int64x1_t:int64x1_t:int64x1_t:int64x2_t
//generate p64:p64:p128
/// Polynomial multiply long
name = vmull_high
no-q
@ -1955,6 +2084,144 @@ validate 9, 30, 11, 20, 13, 18, 15, 48
aarch64 = pmull
generate poly8x16_t:poly8x16_t:poly16x8_t
/// Polynomial multiply long
name = vmull_high
no-q
multi_fn = vmull-noqself-noext, {simd_extract, a, 1}, {simd_extract, b, 1}
a = 1, 15
b = 1, 3
validate 17
target = crypto
aarch64 = pmull2
//generate poly64x2_t:poly64x2_t:p128
/// Vector long multiply with scalar
name = vmull
n-suffix
multi_fn = vmull-in0-noext, a, {vdup-nin0-noext, b}
a = 1, 2, 3, 4, 5, 6, 7, 8
b = 2
validate 2, 4, 6, 8, 10, 12, 14, 16
arm = vmull
aarch64 = smull
generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t
aarch64 = umull
generate uint16x4_t:u16:uint32x4_t, uint32x2_t:u32:uint64x2_t
/// Vector long multiply by scalar
name = vmull_lane
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32
arm = vmull
aarch64 = smull
generate int16x4_t:int16x4_t:int32x4_t, int16x4_t:int16x8_t:int32x4_t
generate int32x2_t:int32x2_t:int64x2_t, int32x2_t:int32x4_t:int64x2_t
aarch64 = umull
generate uint16x4_t:uint16x4_t:uint32x4_t, uint16x4_t:uint16x8_t:uint32x4_t
generate uint32x2_t:uint32x2_t:uint64x2_t, uint32x2_t:uint32x4_t:uint64x2_t
/// Multiply long
name = vmull_high_n
no-q
multi_fn = vmull_high-noqself-noext, a, {vdup-nin0-noext, b}
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
b = 2
validate 18, 20, 22, 24, 26, 28, 30, 32
aarch64 = smull2
generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t
aarch64 = umull2
generate uint16x8_t:u16:uint32x4_t, uint32x4_t:u32:uint64x2_t
/// Multiply long
name = vmull_high_lane
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}}
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 18, 20, 22, 24, 26, 28, 30, 32
aarch64 = smull2
generate int16x8_t:int16x4_t:int32x4_t, int16x8_t:int16x8_t:int32x4_t
generate int32x4_t:int32x2_t:int64x2_t, int32x4_t:int32x4_t:int64x2_t
aarch64 = umull2
generate uint16x8_t:uint16x4_t:uint32x4_t, uint16x8_t:uint16x8_t:uint32x4_t
generate uint32x4_t:uint32x2_t:uint64x2_t, uint32x4_t:uint32x4_t:uint64x2_t
/// Floating-point multiply extended
name = vmulx
a = 1., 2., 3., 4.
b = 2., 2., 2., 2.
validate 2., 4., 6., 8.
aarch64 = fmulx
link-aarch64 = fmulx._EXT_
generate float*_t, float64x*_t
/// Floating-point multiply extended
name = vmulx
lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = vmulx-in0-noext, a, {transmute--<element_t _>, {simd_extract, b, LANE as u32}}
a = 1.
b = 2., 0.
n = 0
validate 2.
aarch64 = fmulx
generate float64x1_t, float64x1_t:float64x2_t:float64x1_t
/// Floating-point multiply extended
name = vmulx
lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}}
a = 1., 2., 3., 4.
b = 2., 0., 0., 0.
n = 0
validate 2., 4., 6., 8.
aarch64 = fmulx
generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t
generate float64x2_t:float64x1_t:float64x2_t, float64x2_t
/// Floating-point multiply extended
name = vmulx
a = 2.
b = 3.
validate 6.
aarch64 = fmulx
link-aarch64 = fmulx._EXT_
generate f32, f64
/// Floating-point multiply extended
name = vmulx
lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = vmulx-out-noext, a, {simd_extract, b, LANE as u32}
a = 2.
b = 3., 0., 0., 0.
n = 0
validate 6.
aarch64 = fmulx
generate f32:float32x2_t:f32, f32:float32x4_t:f32, f64:float64x1_t:f64, f64:float64x2_t:f64
/// Floating-point fused Multiply-Add to accumulator(vector)
name = vfma
a = 2.0, 3.0, 4.0, 5.0
@ -2142,7 +2409,7 @@ generate uint32x4_t:u64
name = vsubhn
no-q
multi_fn = fixed, c:in_t
multi_fn = simd_cast, {simd_shr, {simd_sub}, transmute(c)}
multi_fn = simd_cast, {simd_shr, {simd_sub, a, b}, transmute(c)}
a = MAX, MIN, 1, 1, MAX, MIN, 1, 1
b = 1, 0, 0, 0, 1, 0, 0, 0
fixed = HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS

View file

@ -81,7 +81,7 @@ fn type_len(t: &str) -> usize {
"poly64x1_t" => 1,
"poly64x2_t" => 2,
"i8" | "i16" | "i32" | "i64" | "u8" | "u16" | "u32" | "u64" | "f32" | "f64" | "p8"
| "p16" => 1,
| "p16" | "p64" | "p128" => 1,
_ => panic!("unknown type: {}", t),
}
}
@ -324,16 +324,16 @@ fn type_to_noq_suffix(t: &str) -> &str {
"int16x4_t" | "int16x8_t" | "i16" => "_s16",
"int32x2_t" | "int32x4_t" | "i32" => "_s32",
"int64x1_t" | "int64x2_t" | "i64" => "_s64",
"uint8x8_t" | "uint8x16_t" => "_u8",
"uint16x4_t" | "uint16x8_t" => "_u16",
"uint32x2_t" | "uint32x4_t" => "_u32",
"uint64x1_t" | "uint64x2_t" => "_u64",
"uint8x8_t" | "uint8x16_t" | "u8" => "_u8",
"uint16x4_t" | "uint16x8_t" | "u16" => "_u16",
"uint32x2_t" | "uint32x4_t" | "u32" => "_u32",
"uint64x1_t" | "uint64x2_t" | "u64" => "_u64",
"float16x4_t" | "float16x8_t" => "_f16",
"float32x2_t" | "float32x4_t" => "_f32",
"float64x1_t" | "float64x2_t" => "_f64",
"poly8x8_t" | "poly8x16_t" => "_p8",
"poly16x4_t" | "poly16x8_t" => "_p16",
"poly64x1_t" | "poly64x2_t" => "_p64",
"poly64x1_t" | "poly64x2_t" | "p64" => "_p64",
_ => panic!("unknown type: {}", t),
}
}
@ -347,6 +347,7 @@ enum Suffix {
NSuffix,
NoQNSuffix,
OutSuffix,
OutNSuffix,
Lane,
In2,
In2Lane,
@ -354,8 +355,10 @@ enum Suffix {
#[derive(Clone, Copy)]
enum TargetFeature {
Default,
ArmV7,
FPArmV8,
Crypto,
}
fn type_to_global_type(t: &str) -> &str {
@ -400,6 +403,8 @@ fn type_to_global_type(t: &str) -> &str {
"f64" => "f64",
"p8" => "p8",
"p16" => "p16",
"p64" => "p64",
"p128" => "p128",
_ => panic!("unknown type: {}", t),
}
}
@ -492,6 +497,10 @@ fn type_to_ext(t: &str) -> &str {
"u16" => "v4i16",
"u32" => "v2i32",
"u64" => "v1i64",
"f32" => "f32",
"f64" => "f64",
"p64" => "p64",
"p128" => "p128",
/*
"poly64x1_t" => "i64x1",
"poly64x2_t" => "i64x2",
@ -825,6 +834,7 @@ fn gen_aarch64(
)],
suffix: Suffix,
para_num: i32,
target: TargetFeature,
fixed: &Vec<String>,
multi_fn: &Vec<String>,
) -> (String, String) {
@ -846,16 +856,20 @@ fn gen_aarch64(
NSuffix => format!("{}{}", current_name, type_to_n_suffix(in_t[1])),
NoQNSuffix => format!("{}{}", current_name, type_to_noq_n_suffix(in_t[1])),
OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)),
OutNSuffix => format!("{}{}", current_name, type_to_n_suffix(out_t)),
Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[1])),
In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])),
In2Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[2])),
};
let current_target = match target {
Default => "neon",
ArmV7 => "v7",
FPArmV8 => "fp-armv8,v8",
Crypto => "neon,crypto",
};
let current_fn = if let Some(current_fn) = current_fn.clone() {
if link_aarch64.is_some() {
panic!(
"[{}] Can't specify link and (multi) fn at the same time.",
name
)
panic!("[{}] Can't specify link and fn at the same time.", name)
}
current_fn
} else if link_aarch64.is_some() {
@ -872,7 +886,24 @@ fn gen_aarch64(
let current_aarch64 = current_aarch64.clone().unwrap();
let mut ext_c = String::new();
let mut ext_c_const = String::new();
if let Some(link_aarch64) = link_aarch64.clone() {
let mut link_t: Vec<String> = vec![
in_t[0].to_string(),
in_t[1].to_string(),
in_t[2].to_string(),
out_t.to_string(),
];
if let Some(mut link_aarch64) = link_aarch64.clone() {
if link_aarch64.contains(":") {
let links: Vec<_> = link_aarch64.split(':').map(|v| v.to_string()).collect();
assert_eq!(links.len(), 5);
link_aarch64 = links[0].to_string();
link_t = vec![
links[1].clone(),
links[2].clone(),
links[3].clone(),
links[4].clone(),
];
}
let ext = type_to_ext(in_t[0]);
let ext2 = type_to_ext(out_t);
let link_aarch64 = if link_aarch64.starts_with("llvm") {
@ -893,17 +924,17 @@ fn gen_aarch64(
current_fn,
match para_num {
1 => {
format!("a: {}", in_t[0])
format!("a: {}", link_t[0])
}
2 => {
format!("a: {}, b: {}", in_t[0], in_t[1])
format!("a: {}, b: {}", link_t[0], link_t[1])
}
3 => {
format!("a: {}, b: {}, c: {}", in_t[0], in_t[1], in_t[2])
format!("a: {}, b: {}, c: {}", link_t[0], link_t[1], link_t[2])
}
_ => unimplemented!("unknown para_num"),
},
out_t
link_t[3]
);
if const_aarch64.is_some() {
ext_c_const = format!(
@ -998,6 +1029,11 @@ fn gen_aarch64(
} else {
String::new()
};
let trans: [&str; 2] = if link_t[3] != out_t {
["transmute(", ")"]
} else {
["", ""]
};
let call = if let Some(const_aarch64) = const_aarch64 {
match para_num {
1 => format!(
@ -1033,16 +1069,16 @@ fn gen_aarch64(
match (multi_calls.len(), para_num, fixed.len()) {
(0, 1, 0) => format!(
r#"pub unsafe fn {}{}(a: {}) -> {} {{
{}{}(a)
{}{}{}(a){}
}}"#,
name, const_declare, in_t[0], out_t, ext_c, current_fn,
name, const_declare, in_t[0], out_t, ext_c, trans[0], current_fn, trans[1]
),
(0, 1, _) => {
let fixed: Vec<String> = fixed.iter().take(type_len(in_t[0])).cloned().collect();
format!(
r#"pub unsafe fn {}{}(a: {}) -> {} {{
let b{};
{}{}(a, transmute(b))
{}{}{}(a, transmute(b)){}
}}"#,
name,
const_declare,
@ -1050,14 +1086,16 @@ fn gen_aarch64(
out_t,
values(in_t[0], &fixed),
ext_c,
trans[0],
current_fn,
trans[1],
)
}
(0, 2, _) => format!(
r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{
{}{}(a, b)
{}{}{}(a, b){}
}}"#,
name, const_declare, in_t[0], in_t[1], out_t, ext_c, current_fn,
name, const_declare, in_t[0], in_t[1], out_t, ext_c, trans[0], current_fn, trans[1],
),
(0, 3, _) => format!(
r#"pub unsafe fn {}{}(a: {}, b: {}, c: {}) -> {} {{
@ -1090,11 +1128,11 @@ fn gen_aarch64(
r#"
{}
#[inline]
#[target_feature(enable = "neon")]
#[target_feature(enable = "{}")]
#[cfg_attr(test, assert_instr({}{}))]{}
{}
"#,
current_comment, current_aarch64, const_assert, const_legacy, call
current_comment, current_target, current_aarch64, const_assert, const_legacy, call
);
let test = gen_test(
@ -1259,6 +1297,7 @@ fn gen_arm(
NSuffix => format!("{}{}", current_name, type_to_n_suffix(in_t[1])),
NoQNSuffix => format!("{}{}", current_name, type_to_noq_n_suffix(in_t[1])),
OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)),
OutNSuffix => format!("{}{}", current_name, type_to_n_suffix(out_t)),
Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[1])),
In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])),
In2Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[2])),
@ -1266,10 +1305,17 @@ fn gen_arm(
let current_aarch64 = current_aarch64
.clone()
.unwrap_or_else(|| current_arm.to_string());
let current_target = match target {
let current_target_aarch64 = match target {
Default => "neon",
ArmV7 => "neon",
FPArmV8 => "neon",
Crypto => "neon,crypto",
};
let current_target_arm = match target {
Default => "v7",
ArmV7 => "v7",
FPArmV8 => "fp-armv8,v8",
Crypto => "crypto,v8",
};
let current_fn = if let Some(current_fn) = current_fn.clone() {
@ -1292,9 +1338,57 @@ fn gen_arm(
String::new()
};
let mut ext_c = String::new();
let mut ext_c_const_arm = String::new();
let mut ext_c_const_aarch64 = String::new();
if let (Some(link_arm), Some(link_aarch64)) = (link_arm.clone(), link_aarch64.clone()) {
let mut ext_c_arm = if multi_fn.is_empty() {
String::new()
} else {
String::from(
r#"
"#,
)
};
let mut ext_c_aarch64 = if multi_fn.is_empty() {
String::new()
} else {
String::from(
r#"
"#,
)
};
let mut link_arm_t: Vec<String> = vec![
in_t[0].to_string(),
in_t[1].to_string(),
in_t[2].to_string(),
out_t.to_string(),
];
let mut link_aarch64_t: Vec<String> = vec![
in_t[0].to_string(),
in_t[1].to_string(),
in_t[2].to_string(),
out_t.to_string(),
];
if let (Some(mut link_arm), Some(mut link_aarch64)) = (link_arm.clone(), link_aarch64.clone()) {
if link_arm.contains(":") {
let links: Vec<_> = link_arm.split(':').map(|v| v.to_string()).collect();
assert_eq!(links.len(), 5);
link_arm = links[0].to_string();
link_arm_t = vec![
links[1].clone(),
links[2].clone(),
links[3].clone(),
links[4].clone(),
];
}
if link_aarch64.contains(":") {
let links: Vec<_> = link_aarch64.split(':').map(|v| v.to_string()).collect();
assert_eq!(links.len(), 5);
link_aarch64 = links[0].to_string();
link_aarch64_t = vec![
links[1].clone(),
links[2].clone(),
links[3].clone(),
links[4].clone(),
];
}
let ext = type_to_ext(in_t[0]);
let ext2 = type_to_ext(out_t);
let link_arm = if link_arm.starts_with("llvm") {
@ -1311,35 +1405,36 @@ fn gen_arm(
link.push_str(&link_aarch64);
link.replace("_EXT_", ext).replace("_EXT2_", ext2)
};
ext_c = format!(
r#"#[allow(improper_ctypes)]
if out_t == link_arm_t[3] && out_t == link_aarch64_t[3] {
ext_c = format!(
r#"#[allow(improper_ctypes)]
extern "C" {{
#[cfg_attr(target_arch = "arm", link_name = "{}")]
#[cfg_attr(target_arch = "aarch64", link_name = "{}")]
fn {}({}) -> {};
}}
"#,
link_arm,
link_aarch64,
current_fn,
match para_num {
1 => {
format!("a: {}", in_t[0])
}
2 => {
format!("a: {}, b: {}", in_t[0], in_t[1])
}
3 => {
format!("a: {}, b: {}, c: {}", in_t[0], in_t[1], in_t[2])
}
_ => unimplemented!("unknown para_num"),
},
out_t
);
link_arm,
link_aarch64,
current_fn,
match para_num {
1 => {
format!("a: {}", in_t[0])
}
2 => {
format!("a: {}, b: {}", in_t[0], in_t[1])
}
3 => {
format!("a: {}, b: {}, c: {}", in_t[0], in_t[1], in_t[2])
}
_ => unimplemented!("unknown para_num"),
},
out_t
);
};
if const_arm.is_some() {
ext_c_const_arm = format!(
r#"
#[allow(improper_ctypes)]
ext_c_arm.push_str(&format!(
r#"#[allow(improper_ctypes)]
extern "C" {{
#[cfg_attr(target_arch = "arm", link_name = "{}")]
fn {}({}) -> {};
@ -1363,12 +1458,39 @@ fn gen_arm(
_ => unimplemented!("unknown para_num"),
},
out_t
);
));
};
if out_t != link_arm_t[3] {
ext_c_arm.push_str(&format!(
r#"#[allow(improper_ctypes)]
extern "C" {{
#[cfg_attr(target_arch = "arm", link_name = "{}")]
fn {}({}) -> {};
}}
"#,
link_arm,
current_fn,
match para_num {
1 => {
format!("a: {}", link_arm_t[0])
}
2 => {
format!("a: {}, b: {}", link_arm_t[0], link_arm_t[1])
}
3 => {
format!(
"a: {}, b: {}, c: {}",
link_arm_t[0], link_arm_t[1], link_arm_t[2]
)
}
_ => unimplemented!("unknown para_num"),
},
link_arm_t[3]
));
}
if const_aarch64.is_some() {
ext_c_const_aarch64 = format!(
r#"
#[allow(improper_ctypes)]
ext_c_aarch64.push_str(&format!(
r#"#[allow(improper_ctypes)]
extern "C" {{
#[cfg_attr(target_arch = "aarch64", link_name = "{}")]
fn {}({}) -> {};
@ -1389,7 +1511,35 @@ fn gen_arm(
_ => unimplemented!("unknown para_num"),
},
out_t
);
));
}
if out_t != link_aarch64_t[3] {
ext_c_aarch64.push_str(&format!(
r#"#[allow(improper_ctypes)]
extern "C" {{
#[cfg_attr(target_arch = "aarch64", link_name = "{}")]
fn {}({}) -> {};
}}
"#,
link_aarch64,
current_fn,
match para_num {
1 => {
format!("a: {}", link_aarch64_t[0])
}
2 => {
format!("a: {}, b: {}", link_aarch64_t[0], link_aarch64_t[1])
}
3 => {
format!(
"a: {}, b: {}, c: {}",
link_aarch64_t[0], link_aarch64_t[1], link_aarch64_t[2]
)
}
_ => unimplemented!("unknown para_num"),
},
link_aarch64_t[3]
));
}
};
let multi_calls = if !multi_fn.is_empty() {
@ -1430,6 +1580,11 @@ fn gen_arm(
} else {
String::new()
};
let trans: [&str; 2] = if out_t == link_arm_t[3] && out_t == link_aarch64_t[3] {
["", ""]
} else {
["transmute(", ")"]
};
let call = match (multi_calls.len(), para_num, fixed.len()) {
(0, 1, 0) => format!(
r#"pub unsafe fn {}{}(a: {}) -> {} {{
@ -1485,7 +1640,7 @@ fn gen_arm(
),
(_, _, _) => String::new(),
};
let call_const_arm = if let Some(const_arm) = const_arm {
let call_arm = if let Some(const_arm) = const_arm {
let const_arm = const_arm.replace("ttn", type_to_native_type(in_t[1]));
let mut cnt = String::from(in_t[1]);
cnt.push_str("(");
@ -1501,20 +1656,61 @@ fn gen_arm(
r#"pub unsafe fn {}{}(a: {}) -> {} {{
{}{}{}(a, {})
}}"#,
name, const_declare, in_t[0], out_t, multi_calls, ext_c_const_arm, current_fn, cnt
name, const_declare, in_t[0], out_t, multi_calls, ext_c_arm, current_fn, cnt
),
2 => format!(
r#"pub unsafe fn {}{}(a: {}) -> {} {{
r#"pub unsafe fn {}{}(a: {}, b:{}) -> {} {{
{}{}{}(a, b, {})
}}"#,
name, const_declare, in_t[0], out_t, multi_calls, ext_c_const_arm, current_fn, cnt
name,
const_declare,
in_t[0],
in_t[1],
out_t,
multi_calls,
ext_c_arm,
current_fn,
cnt
),
_ => String::new(),
}
} else if out_t != link_arm_t[3] {
match para_num {
1 => format!(
r#"pub unsafe fn {}{}(a: {}) -> {} {{
{}{}{}{}(a){}
}}"#,
name,
const_declare,
in_t[0],
out_t,
multi_calls,
ext_c_arm,
trans[0],
current_fn,
trans[1]
),
2 => format!(
r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{
{}{}{}{}(transmute(a), transmute(b)){}
}}"#,
name,
const_declare,
in_t[0],
in_t[1],
out_t,
multi_calls,
ext_c_arm,
trans[0],
current_fn,
trans[1],
),
_ => String::new(),
}
} else {
String::new()
};
let call_const_aarch64 = if let Some(const_aarch64) = const_aarch64 {
let call_aarch64 = if let Some(const_aarch64) = const_aarch64 {
match para_num {
1 => format!(
r#"pub unsafe fn {}{}(a: {}) -> {} {{
@ -1525,55 +1721,94 @@ fn gen_arm(
in_t[0],
out_t,
multi_calls,
ext_c_const_aarch64,
ext_c_aarch64,
current_fn,
const_aarch64
),
2 => format!(
r#"pub unsafe fn {}{}(a: {}) -> {} {{
r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{
{}{}{}(a, b, {})
}}"#,
name,
const_declare,
in_t[0],
in_t[1],
out_t,
multi_calls,
ext_c_aarch64,
current_fn,
const_aarch64
),
_ => String::new(),
}
} else if out_t != link_aarch64_t[3] {
match para_num {
1 => format!(
r#"pub unsafe fn {}{}(a: {}) -> {} {{
{}{}{}{}(a){}
}}"#,
name,
const_declare,
in_t[0],
out_t,
multi_calls,
ext_c_const_aarch64,
ext_c_aarch64,
trans[0],
current_fn,
const_aarch64
trans[1],
),
2 => format!(
r#"pub unsafe fn {}{}(a: {}, b: {}) -> {} {{
{}{}{}{}(a, b){}
}}"#,
name,
const_declare,
in_t[0],
in_t[1],
out_t,
multi_calls,
ext_c_aarch64,
trans[0],
current_fn,
trans[1],
),
_ => String::new(),
}
} else {
String::new()
};
let function = if const_arm.is_some() && const_aarch64.is_some() {
let function = if (const_arm.is_some() && const_aarch64.is_some())
|| out_t != link_arm_t[3]
|| out_t != link_aarch64_t[3]
{
format!(
r#"
{}
#[inline]
#[cfg(target_arch = "arm")]
#[target_feature(enable = "neon,v7")]
#[target_feature(enable = "neon,{}")]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr({}{}))]{}
{}
{}
#[inline]
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[target_feature(enable = "{}")]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr({}{}))]{}
{}
"#,
current_comment,
current_target_arm,
expand_intrinsic(&current_arm, in_t[1]),
const_assert,
const_legacy,
call_const_arm,
call_arm,
current_comment,
current_target_aarch64,
expand_intrinsic(&current_aarch64, in_t[1]),
const_assert,
const_legacy,
call_const_aarch64,
call_aarch64,
)
} else {
format!(
@ -1587,7 +1822,7 @@ fn gen_arm(
{}
"#,
current_comment,
current_target,
current_target_arm,
expand_intrinsic(&current_arm, in_t[1]),
const_assert,
expand_intrinsic(&current_aarch64, in_t[1]),
@ -1755,6 +1990,7 @@ fn get_call(
let len = match &*fn_format[1] {
"out_len" => type_len(out_t),
"in_len" => type_len(in_t[1]),
"in0_len" => type_len(in_t[0]),
"halflen" => type_len(in_t[1]) / 2,
_ => 0,
};
@ -2003,6 +2239,8 @@ fn get_call(
fn_name.push_str(type_to_n_suffix(in_t[1]));
} else if fn_format[1] == "out" {
fn_name.push_str(type_to_suffix(out_t));
} else if fn_format[1] == "in0" {
fn_name.push_str(type_to_suffix(in_t[0]));
} else if fn_format[1] == "in2" {
fn_name.push_str(type_to_suffix(in_t[2]));
} else if fn_format[1] == "signed" {
@ -2028,6 +2266,8 @@ fn get_call(
fn_name.push_str(&(type_len(in_t[1]) / 2).to_string());
} else if fn_format[1] == "nout" {
fn_name.push_str(type_to_n_suffix(out_t));
} else if fn_format[1] == "nin0" {
fn_name.push_str(type_to_n_suffix(in_t[0]));
} else if fn_format[1] == "nsigned" {
fn_name.push_str(type_to_n_suffix(type_to_signed(in_t[1])));
} else if fn_format[1] == "in_ntt" {
@ -2063,7 +2303,7 @@ fn get_call(
}
}
if param_str.is_empty() {
param_str.push_str("a, b");
return fn_name;
}
let fn_str = if let Some((re_name, re_type)) = re.clone() {
format!(
@ -2108,7 +2348,7 @@ fn main() -> io::Result<()> {
Vec<String>,
)> = Vec::new();
let mut multi_fn: Vec<String> = Vec::new();
let mut target: TargetFeature = ArmV7;
let mut target: TargetFeature = Default;
//
// THIS FILE IS GENERATED FORM neon.spec DO NOT CHANGE IT MANUALLY
@ -2189,7 +2429,7 @@ mod test {
fixed = Vec::new();
n = None;
multi_fn = Vec::new();
target = ArmV7;
target = Default;
} else if line.starts_with("//") {
} else if line.starts_with("name = ") {
current_name = Some(String::from(&line[7..]));
@ -2211,6 +2451,8 @@ mod test {
suffix = NoQDouble;
} else if line.starts_with("n-suffix") {
suffix = NSuffix;
} else if line.starts_with("out-n-suffix") {
suffix = OutNSuffix;
} else if line.starts_with("noq-n-suffix") {
suffix = NoQNSuffix;
} else if line.starts_with("out-suffix") {
@ -2245,10 +2487,12 @@ mod test {
} else if line.starts_with("target = ") {
target = match Some(String::from(&line[9..])) {
Some(input) => match input.as_str() {
"v7" => ArmV7,
"fp-armv8" => FPArmV8,
_ => ArmV7,
"crypto" => Crypto,
_ => Default,
},
_ => ArmV7,
_ => Default,
}
} else if line.starts_with("generate ") {
let line = &line[9..];
@ -2328,6 +2572,7 @@ mod test {
&current_tests,
suffix,
para_num,
target,
&fixed,
&multi_fn,
);