manually const-ify shuffle arguments (#1160)

This commit is contained in:
Ralf Jung 2021-05-11 22:11:52 +02:00 committed by GitHub
parent 7516a80c31
commit a34883b5d3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 1655 additions and 1549 deletions

File diff suppressed because it is too large Load diff

View file

@ -1595,7 +1595,7 @@ pub unsafe fn vext_f64<const N: i32>(a: float64x1_t, _b: float64x1_t) -> float64
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_s8(low: int8x8_t, high: int8x8_t) -> int8x16_t {
simd_shuffle16(
simd_shuffle16!(
low,
high,
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
@ -1607,7 +1607,7 @@ pub unsafe fn vcombine_s8(low: int8x8_t, high: int8x8_t) -> int8x16_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_s16(low: int16x4_t, high: int16x4_t) -> int16x8_t {
simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Vector combine
@ -1615,7 +1615,7 @@ pub unsafe fn vcombine_s16(low: int16x4_t, high: int16x4_t) -> int16x8_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_s32(low: int32x2_t, high: int32x2_t) -> int32x4_t {
simd_shuffle4(low, high, [0, 1, 2, 3])
simd_shuffle4!(low, high, [0, 1, 2, 3])
}
/// Vector combine
@ -1623,7 +1623,7 @@ pub unsafe fn vcombine_s32(low: int32x2_t, high: int32x2_t) -> int32x4_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_s64(low: int64x1_t, high: int64x1_t) -> int64x2_t {
simd_shuffle2(low, high, [0, 1])
simd_shuffle2!(low, high, [0, 1])
}
/// Vector combine
@ -1631,7 +1631,7 @@ pub unsafe fn vcombine_s64(low: int64x1_t, high: int64x1_t) -> int64x2_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_u8(low: uint8x8_t, high: uint8x8_t) -> uint8x16_t {
simd_shuffle16(
simd_shuffle16!(
low,
high,
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
@ -1643,7 +1643,7 @@ pub unsafe fn vcombine_u8(low: uint8x8_t, high: uint8x8_t) -> uint8x16_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_u16(low: uint16x4_t, high: uint16x4_t) -> uint16x8_t {
simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Vector combine
@ -1651,7 +1651,7 @@ pub unsafe fn vcombine_u16(low: uint16x4_t, high: uint16x4_t) -> uint16x8_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_u32(low: uint32x2_t, high: uint32x2_t) -> uint32x4_t {
simd_shuffle4(low, high, [0, 1, 2, 3])
simd_shuffle4!(low, high, [0, 1, 2, 3])
}
/// Vector combine
@ -1659,7 +1659,7 @@ pub unsafe fn vcombine_u32(low: uint32x2_t, high: uint32x2_t) -> uint32x4_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_u64(low: uint64x1_t, high: uint64x1_t) -> uint64x2_t {
simd_shuffle2(low, high, [0, 1])
simd_shuffle2!(low, high, [0, 1])
}
/// Vector combine
@ -1667,7 +1667,7 @@ pub unsafe fn vcombine_u64(low: uint64x1_t, high: uint64x1_t) -> uint64x2_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_p64(low: poly64x1_t, high: poly64x1_t) -> poly64x2_t {
simd_shuffle2(low, high, [0, 1])
simd_shuffle2!(low, high, [0, 1])
}
/// Duplicate vector element to vector or scalar
@ -1772,7 +1772,7 @@ pub unsafe fn vget_low_p64(a: poly64x2_t) -> poly64x1_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_f16 ( low: float16x4_t, high: float16x4_t) -> float16x8_t {
simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
}
*/
@ -1781,7 +1781,7 @@ pub unsafe fn vcombine_f16 ( low: float16x4_t, high: float16x4_t) -> float16x8_
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_f32(low: float32x2_t, high: float32x2_t) -> float32x4_t {
simd_shuffle4(low, high, [0, 1, 2, 3])
simd_shuffle4!(low, high, [0, 1, 2, 3])
}
/// Vector combine
@ -1789,7 +1789,7 @@ pub unsafe fn vcombine_f32(low: float32x2_t, high: float32x2_t) -> float32x4_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_p8(low: poly8x8_t, high: poly8x8_t) -> poly8x16_t {
simd_shuffle16(
simd_shuffle16!(
low,
high,
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
@ -1801,7 +1801,7 @@ pub unsafe fn vcombine_p8(low: poly8x8_t, high: poly8x8_t) -> poly8x16_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_p16(low: poly16x4_t, high: poly16x4_t) -> poly16x8_t {
simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Vector combine
@ -1809,7 +1809,7 @@ pub unsafe fn vcombine_p16(low: poly16x4_t, high: poly16x4_t) -> poly16x8_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_f64(low: float64x1_t, high: float64x1_t) -> float64x2_t {
simd_shuffle2(low, high, [0, 1])
simd_shuffle2!(low, high, [0, 1])
}
/// Table look-up

View file

@ -580,7 +580,7 @@ pub unsafe fn vld1q_lane_f32<const LANE: i32>(ptr: *const f32, src: float32x4_t)
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_s8(ptr: *const i8) -> int8x8_t {
let x = vld1_lane_s8::<0>(ptr, transmute(i8x8::splat(0)));
simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -591,7 +591,7 @@ pub unsafe fn vld1_dup_s8(ptr: *const i8) -> int8x8_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_s8(ptr: *const i8) -> int8x16_t {
let x = vld1q_lane_s8::<0>(ptr, transmute(i8x16::splat(0)));
simd_shuffle16(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle16!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -602,7 +602,7 @@ pub unsafe fn vld1q_dup_s8(ptr: *const i8) -> int8x16_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_s16(ptr: *const i16) -> int16x4_t {
let x = vld1_lane_s16::<0>(ptr, transmute(i16x4::splat(0)));
simd_shuffle4(x, x, [0, 0, 0, 0])
simd_shuffle4!(x, x, [0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -613,7 +613,7 @@ pub unsafe fn vld1_dup_s16(ptr: *const i16) -> int16x4_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_s16(ptr: *const i16) -> int16x8_t {
let x = vld1q_lane_s16::<0>(ptr, transmute(i16x8::splat(0)));
simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -624,7 +624,7 @@ pub unsafe fn vld1q_dup_s16(ptr: *const i16) -> int16x8_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_s32(ptr: *const i32) -> int32x2_t {
let x = vld1_lane_s32::<0>(ptr, transmute(i32x2::splat(0)));
simd_shuffle2(x, x, [0, 0])
simd_shuffle2!(x, x, [0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -635,7 +635,7 @@ pub unsafe fn vld1_dup_s32(ptr: *const i32) -> int32x2_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_s32(ptr: *const i32) -> int32x4_t {
let x = vld1q_lane_s32::<0>(ptr, transmute(i32x4::splat(0)));
simd_shuffle4(x, x, [0, 0, 0, 0])
simd_shuffle4!(x, x, [0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -663,7 +663,7 @@ pub unsafe fn vld1_dup_s64(ptr: *const i64) -> int64x1_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_s64(ptr: *const i64) -> int64x2_t {
let x = vld1q_lane_s64::<0>(ptr, transmute(i64x2::splat(0)));
simd_shuffle2(x, x, [0, 0])
simd_shuffle2!(x, x, [0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -674,7 +674,7 @@ pub unsafe fn vld1q_dup_s64(ptr: *const i64) -> int64x2_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_u8(ptr: *const u8) -> uint8x8_t {
let x = vld1_lane_u8::<0>(ptr, transmute(u8x8::splat(0)));
simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -685,7 +685,7 @@ pub unsafe fn vld1_dup_u8(ptr: *const u8) -> uint8x8_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_u8(ptr: *const u8) -> uint8x16_t {
let x = vld1q_lane_u8::<0>(ptr, transmute(u8x16::splat(0)));
simd_shuffle16(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle16!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -696,7 +696,7 @@ pub unsafe fn vld1q_dup_u8(ptr: *const u8) -> uint8x16_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_u16(ptr: *const u16) -> uint16x4_t {
let x = vld1_lane_u16::<0>(ptr, transmute(u16x4::splat(0)));
simd_shuffle4(x, x, [0, 0, 0, 0])
simd_shuffle4!(x, x, [0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -707,7 +707,7 @@ pub unsafe fn vld1_dup_u16(ptr: *const u16) -> uint16x4_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_u16(ptr: *const u16) -> uint16x8_t {
let x = vld1q_lane_u16::<0>(ptr, transmute(u16x8::splat(0)));
simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -718,7 +718,7 @@ pub unsafe fn vld1q_dup_u16(ptr: *const u16) -> uint16x8_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_u32(ptr: *const u32) -> uint32x2_t {
let x = vld1_lane_u32::<0>(ptr, transmute(u32x2::splat(0)));
simd_shuffle2(x, x, [0, 0])
simd_shuffle2!(x, x, [0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -729,7 +729,7 @@ pub unsafe fn vld1_dup_u32(ptr: *const u32) -> uint32x2_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_u32(ptr: *const u32) -> uint32x4_t {
let x = vld1q_lane_u32::<0>(ptr, transmute(u32x4::splat(0)));
simd_shuffle4(x, x, [0, 0, 0, 0])
simd_shuffle4!(x, x, [0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -757,7 +757,7 @@ pub unsafe fn vld1_dup_u64(ptr: *const u64) -> uint64x1_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_u64(ptr: *const u64) -> uint64x2_t {
let x = vld1q_lane_u64::<0>(ptr, transmute(u64x2::splat(0)));
simd_shuffle2(x, x, [0, 0])
simd_shuffle2!(x, x, [0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -768,7 +768,7 @@ pub unsafe fn vld1q_dup_u64(ptr: *const u64) -> uint64x2_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_p8(ptr: *const p8) -> poly8x8_t {
let x = vld1_lane_p8::<0>(ptr, transmute(u8x8::splat(0)));
simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -779,7 +779,7 @@ pub unsafe fn vld1_dup_p8(ptr: *const p8) -> poly8x8_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_p8(ptr: *const p8) -> poly8x16_t {
let x = vld1q_lane_p8::<0>(ptr, transmute(u8x16::splat(0)));
simd_shuffle16(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle16!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -790,7 +790,7 @@ pub unsafe fn vld1q_dup_p8(ptr: *const p8) -> poly8x16_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_p16(ptr: *const p16) -> poly16x4_t {
let x = vld1_lane_p16::<0>(ptr, transmute(u16x4::splat(0)));
simd_shuffle4(x, x, [0, 0, 0, 0])
simd_shuffle4!(x, x, [0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -801,7 +801,7 @@ pub unsafe fn vld1_dup_p16(ptr: *const p16) -> poly16x4_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_p16(ptr: *const p16) -> poly16x8_t {
let x = vld1q_lane_p16::<0>(ptr, transmute(u16x8::splat(0)));
simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -812,7 +812,7 @@ pub unsafe fn vld1q_dup_p16(ptr: *const p16) -> poly16x8_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_f32(ptr: *const f32) -> float32x2_t {
let x = vld1_lane_f32::<0>(ptr, transmute(f32x2::splat(0.)));
simd_shuffle2(x, x, [0, 0])
simd_shuffle2!(x, x, [0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -823,7 +823,7 @@ pub unsafe fn vld1_dup_f32(ptr: *const f32) -> float32x2_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_f32(ptr: *const f32) -> float32x4_t {
let x = vld1q_lane_f32::<0>(ptr, transmute(f32x4::splat(0.)));
simd_shuffle4(x, x, [0, 0, 0, 0])
simd_shuffle4!(x, x, [0, 0, 0, 0])
}
// signed absolute difference and accumulate (64-bit)
@ -1284,8 +1284,8 @@ pub unsafe fn vaddl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))]
pub unsafe fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
let a: int8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let a: int8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let a: int16x8_t = simd_cast(a);
let b: int16x8_t = simd_cast(b);
simd_add(a, b)
@ -1298,8 +1298,8 @@ pub unsafe fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))]
pub unsafe fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
let a: int32x4_t = simd_cast(a);
let b: int32x4_t = simd_cast(b);
simd_add(a, b)
@ -1312,8 +1312,8 @@ pub unsafe fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))]
pub unsafe fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
let a: int32x2_t = simd_shuffle2(a, a, [2, 3]);
let b: int32x2_t = simd_shuffle2(b, b, [2, 3]);
let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
let a: int64x2_t = simd_cast(a);
let b: int64x2_t = simd_cast(b);
simd_add(a, b)
@ -1326,8 +1326,8 @@ pub unsafe fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))]
pub unsafe fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
let a: uint8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let a: uint8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let a: uint16x8_t = simd_cast(a);
let b: uint16x8_t = simd_cast(b);
simd_add(a, b)
@ -1340,8 +1340,8 @@ pub unsafe fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))]
pub unsafe fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
let a: uint16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
let a: uint16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
let b: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
let a: uint32x4_t = simd_cast(a);
let b: uint32x4_t = simd_cast(b);
simd_add(a, b)
@ -1354,8 +1354,8 @@ pub unsafe fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))]
pub unsafe fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
let a: uint32x2_t = simd_shuffle2(a, a, [2, 3]);
let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
let a: uint32x2_t = simd_shuffle2!(a, a, [2, 3]);
let b: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
let a: uint64x2_t = simd_cast(a);
let b: uint64x2_t = simd_cast(b);
simd_add(a, b)
@ -1434,7 +1434,7 @@ pub unsafe fn vaddw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))]
pub unsafe fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: int16x8_t = simd_cast(b);
simd_add(a, b)
}
@ -1446,7 +1446,7 @@ pub unsafe fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))]
pub unsafe fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
let b: int32x4_t = simd_cast(b);
simd_add(a, b)
}
@ -1458,7 +1458,7 @@ pub unsafe fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))]
pub unsafe fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
let b: int32x2_t = simd_shuffle2(b, b, [2, 3]);
let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
let b: int64x2_t = simd_cast(b);
simd_add(a, b)
}
@ -1470,7 +1470,7 @@ pub unsafe fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))]
pub unsafe fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t {
let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: uint16x8_t = simd_cast(b);
simd_add(a, b)
}
@ -1482,7 +1482,7 @@ pub unsafe fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))]
pub unsafe fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
let b: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
let b: uint32x4_t = simd_cast(b);
simd_add(a, b)
}
@ -1494,7 +1494,7 @@ pub unsafe fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))]
pub unsafe fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t {
let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
let b: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
let b: uint64x2_t = simd_cast(b);
simd_add(a, b)
}
@ -1567,7 +1567,7 @@ pub unsafe fn vaddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
pub unsafe fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t {
let x = simd_cast(simd_shr(simd_add(a, b), int16x8_t(8, 8, 8, 8, 8, 8, 8, 8)));
simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
}
/// Add returning High Narrow (high half).
@ -1578,7 +1578,7 @@ pub unsafe fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x1
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
pub unsafe fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t {
let x = simd_cast(simd_shr(simd_add(a, b), int32x4_t(16, 16, 16, 16)));
simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Add returning High Narrow (high half).
@ -1589,7 +1589,7 @@ pub unsafe fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
pub unsafe fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t {
let x = simd_cast(simd_shr(simd_add(a, b), int64x2_t(32, 32)));
simd_shuffle4(r, x, [0, 1, 2, 3])
simd_shuffle4!(r, x, [0, 1, 2, 3])
}
/// Add returning High Narrow (high half).
@ -1600,7 +1600,7 @@ pub unsafe fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
pub unsafe fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t {
let x = simd_cast(simd_shr(simd_add(a, b), uint16x8_t(8, 8, 8, 8, 8, 8, 8, 8)));
simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
}
/// Add returning High Narrow (high half).
@ -1611,7 +1611,7 @@ pub unsafe fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uin
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
pub unsafe fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t {
let x = simd_cast(simd_shr(simd_add(a, b), uint32x4_t(16, 16, 16, 16)));
simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Add returning High Narrow (high half).
@ -1622,7 +1622,7 @@ pub unsafe fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> ui
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
pub unsafe fn vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t {
let x = simd_cast(simd_shr(simd_add(a, b), uint64x2_t(32, 32)));
simd_shuffle4(r, x, [0, 1, 2, 3])
simd_shuffle4!(r, x, [0, 1, 2, 3])
}
/// Rounding Add returning High Narrow.
@ -1693,7 +1693,7 @@ pub unsafe fn vraddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
pub unsafe fn vraddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t {
let x = vraddhn_s16_(a, b);
simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
}
/// Rounding Add returning High Narrow (high half).
@ -1704,7 +1704,7 @@ pub unsafe fn vraddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
pub unsafe fn vraddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t {
let x = vraddhn_s32_(a, b);
simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Rounding Add returning High Narrow (high half).
@ -1715,7 +1715,7 @@ pub unsafe fn vraddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int1
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
pub unsafe fn vraddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t {
let x = vraddhn_s64_(a, b);
simd_shuffle4(r, x, [0, 1, 2, 3])
simd_shuffle4!(r, x, [0, 1, 2, 3])
}
/// Rounding Add returning High Narrow (high half).
@ -1726,7 +1726,7 @@ pub unsafe fn vraddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int3
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
pub unsafe fn vraddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t {
let x: uint8x8_t = transmute(vraddhn_s16_(transmute(a), transmute(b)));
simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
}
/// Rounding Add returning High Narrow (high half).
@ -1737,7 +1737,7 @@ pub unsafe fn vraddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> ui
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
pub unsafe fn vraddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t {
let x: uint16x4_t = transmute(vraddhn_s32_(transmute(a), transmute(b)));
simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Rounding Add returning High Narrow (high half).
@ -1748,7 +1748,7 @@ pub unsafe fn vraddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> u
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
pub unsafe fn vraddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t {
let x: uint32x2_t = transmute(vraddhn_s64_(transmute(a), transmute(b)));
simd_shuffle4(r, x, [0, 1, 2, 3])
simd_shuffle4!(r, x, [0, 1, 2, 3])
}
/// Signed Add Long Pairwise.
@ -2961,7 +2961,7 @@ pub unsafe fn vget_lane_u8<const IMM5: i32>(v: uint8x8_t) -> u8 {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_s8(a: int8x16_t) -> int8x8_t {
simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
}
/// Duplicate vector element to vector or scalar
@ -2971,7 +2971,7 @@ pub unsafe fn vget_high_s8(a: int8x16_t) -> int8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_s16(a: int16x8_t) -> int16x4_t {
simd_shuffle4(a, a, [4, 5, 6, 7])
simd_shuffle4!(a, a, [4, 5, 6, 7])
}
/// Duplicate vector element to vector or scalar
@ -2981,7 +2981,7 @@ pub unsafe fn vget_high_s16(a: int16x8_t) -> int16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_s32(a: int32x4_t) -> int32x2_t {
simd_shuffle2(a, a, [2, 3])
simd_shuffle2!(a, a, [2, 3])
}
/// Duplicate vector element to vector or scalar
@ -3001,7 +3001,7 @@ pub unsafe fn vget_high_s64(a: int64x2_t) -> int64x1_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_u8(a: uint8x16_t) -> uint8x8_t {
simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
}
/// Duplicate vector element to vector or scalar
@ -3011,7 +3011,7 @@ pub unsafe fn vget_high_u8(a: uint8x16_t) -> uint8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_u16(a: uint16x8_t) -> uint16x4_t {
simd_shuffle4(a, a, [4, 5, 6, 7])
simd_shuffle4!(a, a, [4, 5, 6, 7])
}
/// Duplicate vector element to vector or scalar
@ -3021,7 +3021,7 @@ pub unsafe fn vget_high_u16(a: uint16x8_t) -> uint16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_u32(a: uint32x4_t) -> uint32x2_t {
simd_shuffle2(a, a, [2, 3])
simd_shuffle2!(a, a, [2, 3])
}
/// Duplicate vector element to vector or scalar
@ -3041,7 +3041,7 @@ pub unsafe fn vget_high_u64(a: uint64x2_t) -> uint64x1_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_p8(a: poly8x16_t) -> poly8x8_t {
simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
}
/// Duplicate vector element to vector or scalar
@ -3051,7 +3051,7 @@ pub unsafe fn vget_high_p8(a: poly8x16_t) -> poly8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_p16(a: poly16x8_t) -> poly16x4_t {
simd_shuffle4(a, a, [4, 5, 6, 7])
simd_shuffle4!(a, a, [4, 5, 6, 7])
}
/// Duplicate vector element to vector or scalar
@ -3061,7 +3061,7 @@ pub unsafe fn vget_high_p16(a: poly16x8_t) -> poly16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_f32(a: float32x4_t) -> float32x2_t {
simd_shuffle2(a, a, [2, 3])
simd_shuffle2!(a, a, [2, 3])
}
/// Duplicate vector element to vector or scalar
@ -3071,7 +3071,7 @@ pub unsafe fn vget_high_f32(a: float32x4_t) -> float32x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_s8(a: int8x16_t) -> int8x8_t {
simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Duplicate vector element to vector or scalar
@ -3081,7 +3081,7 @@ pub unsafe fn vget_low_s8(a: int8x16_t) -> int8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_s16(a: int16x8_t) -> int16x4_t {
simd_shuffle4(a, a, [0, 1, 2, 3])
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
/// Duplicate vector element to vector or scalar
@ -3091,7 +3091,7 @@ pub unsafe fn vget_low_s16(a: int16x8_t) -> int16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_s32(a: int32x4_t) -> int32x2_t {
simd_shuffle2(a, a, [0, 1])
simd_shuffle2!(a, a, [0, 1])
}
/// Duplicate vector element to vector or scalar
@ -3111,7 +3111,7 @@ pub unsafe fn vget_low_s64(a: int64x2_t) -> int64x1_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_u8(a: uint8x16_t) -> uint8x8_t {
simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Duplicate vector element to vector or scalar
@ -3121,7 +3121,7 @@ pub unsafe fn vget_low_u8(a: uint8x16_t) -> uint8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_u16(a: uint16x8_t) -> uint16x4_t {
simd_shuffle4(a, a, [0, 1, 2, 3])
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
/// Duplicate vector element to vector or scalar
@ -3131,7 +3131,7 @@ pub unsafe fn vget_low_u16(a: uint16x8_t) -> uint16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_u32(a: uint32x4_t) -> uint32x2_t {
simd_shuffle2(a, a, [0, 1])
simd_shuffle2!(a, a, [0, 1])
}
/// Duplicate vector element to vector or scalar
@ -3151,7 +3151,7 @@ pub unsafe fn vget_low_u64(a: uint64x2_t) -> uint64x1_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_p8(a: poly8x16_t) -> poly8x8_t {
simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Duplicate vector element to vector or scalar
@ -3161,7 +3161,7 @@ pub unsafe fn vget_low_p8(a: poly8x16_t) -> poly8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_p16(a: poly16x8_t) -> poly16x4_t {
simd_shuffle4(a, a, [0, 1, 2, 3])
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
/// Duplicate vector element to vector or scalar
@ -3171,7 +3171,7 @@ pub unsafe fn vget_low_p16(a: poly16x8_t) -> poly16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_f32(a: float32x4_t) -> float32x2_t {
simd_shuffle2(a, a, [0, 1])
simd_shuffle2!(a, a, [0, 1])
}
/// Duplicate vector element to vector or scalar
@ -3713,7 +3713,7 @@ pub unsafe fn vcntq_p8(a: poly8x16_t) -> poly8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
pub unsafe fn vrev16_s8(a: int8x8_t) -> int8x8_t {
simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
}
/// Reversing vector elements (swap endianness)
@ -3723,7 +3723,7 @@ pub unsafe fn vrev16_s8(a: int8x8_t) -> int8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
pub unsafe fn vrev16q_s8(a: int8x16_t) -> int8x16_t {
simd_shuffle16(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
simd_shuffle16!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
}
/// Reversing vector elements (swap endianness)
@ -3733,7 +3733,7 @@ pub unsafe fn vrev16q_s8(a: int8x16_t) -> int8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
pub unsafe fn vrev16_u8(a: uint8x8_t) -> uint8x8_t {
simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
}
/// Reversing vector elements (swap endianness)
@ -3743,7 +3743,7 @@ pub unsafe fn vrev16_u8(a: uint8x8_t) -> uint8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
pub unsafe fn vrev16q_u8(a: uint8x16_t) -> uint8x16_t {
simd_shuffle16(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
simd_shuffle16!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
}
/// Reversing vector elements (swap endianness)
@ -3753,7 +3753,7 @@ pub unsafe fn vrev16q_u8(a: uint8x16_t) -> uint8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
pub unsafe fn vrev16_p8(a: poly8x8_t) -> poly8x8_t {
simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
}
/// Reversing vector elements (swap endianness)
@ -3763,7 +3763,7 @@ pub unsafe fn vrev16_p8(a: poly8x8_t) -> poly8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
pub unsafe fn vrev16q_p8(a: poly8x16_t) -> poly8x16_t {
simd_shuffle16(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
simd_shuffle16!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
}
/// Reversing vector elements (swap endianness)
@ -3773,7 +3773,7 @@ pub unsafe fn vrev16q_p8(a: poly8x16_t) -> poly8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32_s8(a: int8x8_t) -> int8x8_t {
simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
}
/// Reversing vector elements (swap endianness)
@ -3783,7 +3783,7 @@ pub unsafe fn vrev32_s8(a: int8x8_t) -> int8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32q_s8(a: int8x16_t) -> int8x16_t {
simd_shuffle16(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
simd_shuffle16!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
}
/// Reversing vector elements (swap endianness)
@ -3793,7 +3793,7 @@ pub unsafe fn vrev32q_s8(a: int8x16_t) -> int8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32_u8(a: uint8x8_t) -> uint8x8_t {
simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
}
/// Reversing vector elements (swap endianness)
@ -3803,7 +3803,7 @@ pub unsafe fn vrev32_u8(a: uint8x8_t) -> uint8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32q_u8(a: uint8x16_t) -> uint8x16_t {
simd_shuffle16(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
simd_shuffle16!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
}
/// Reversing vector elements (swap endianness)
@ -3813,7 +3813,7 @@ pub unsafe fn vrev32q_u8(a: uint8x16_t) -> uint8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32_s16(a: int16x4_t) -> int16x4_t {
simd_shuffle4(a, a, [1, 0, 3, 2])
simd_shuffle4!(a, a, [1, 0, 3, 2])
}
/// Reversing vector elements (swap endianness)
@ -3823,7 +3823,7 @@ pub unsafe fn vrev32_s16(a: int16x4_t) -> int16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32q_s16(a: int16x8_t) -> int16x8_t {
simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
}
/// Reversing vector elements (swap endianness)
@ -3833,7 +3833,7 @@ pub unsafe fn vrev32q_s16(a: int16x8_t) -> int16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32_p16(a: poly16x4_t) -> poly16x4_t {
simd_shuffle4(a, a, [1, 0, 3, 2])
simd_shuffle4!(a, a, [1, 0, 3, 2])
}
/// Reversing vector elements (swap endianness)
@ -3843,7 +3843,7 @@ pub unsafe fn vrev32_p16(a: poly16x4_t) -> poly16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32q_p16(a: poly16x8_t) -> poly16x8_t {
simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
}
/// Reversing vector elements (swap endianness)
@ -3853,7 +3853,7 @@ pub unsafe fn vrev32q_p16(a: poly16x8_t) -> poly16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32_u16(a: uint16x4_t) -> uint16x4_t {
simd_shuffle4(a, a, [1, 0, 3, 2])
simd_shuffle4!(a, a, [1, 0, 3, 2])
}
/// Reversing vector elements (swap endianness)
@ -3863,7 +3863,7 @@ pub unsafe fn vrev32_u16(a: uint16x4_t) -> uint16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32q_u16(a: uint16x8_t) -> uint16x8_t {
simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
}
/// Reversing vector elements (swap endianness)
@ -3873,7 +3873,7 @@ pub unsafe fn vrev32q_u16(a: uint16x8_t) -> uint16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32_p8(a: poly8x8_t) -> poly8x8_t {
simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
}
/// Reversing vector elements (swap endianness)
@ -3883,7 +3883,7 @@ pub unsafe fn vrev32_p8(a: poly8x8_t) -> poly8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32q_p8(a: poly8x16_t) -> poly8x16_t {
simd_shuffle16(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
simd_shuffle16!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
}
/// Reversing vector elements (swap endianness)
@ -3893,7 +3893,7 @@ pub unsafe fn vrev32q_p8(a: poly8x16_t) -> poly8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_s8(a: int8x8_t) -> int8x8_t {
simd_shuffle8(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
simd_shuffle8!(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
}
/// Reversing vector elements (swap endianness)
@ -3903,7 +3903,7 @@ pub unsafe fn vrev64_s8(a: int8x8_t) -> int8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_s8(a: int8x16_t) -> int8x16_t {
simd_shuffle16(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
simd_shuffle16!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
}
/// Reversing vector elements (swap endianness)
@ -3913,7 +3913,7 @@ pub unsafe fn vrev64q_s8(a: int8x16_t) -> int8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_s16(a: int16x4_t) -> int16x4_t {
simd_shuffle4(a, a, [3, 2, 1, 0])
simd_shuffle4!(a, a, [3, 2, 1, 0])
}
/// Reversing vector elements (swap endianness)
@ -3923,7 +3923,7 @@ pub unsafe fn vrev64_s16(a: int16x4_t) -> int16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_s16(a: int16x8_t) -> int16x8_t {
simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
}
/// Reversing vector elements (swap endianness)
@ -3933,7 +3933,7 @@ pub unsafe fn vrev64q_s16(a: int16x8_t) -> int16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_s32(a: int32x2_t) -> int32x2_t {
simd_shuffle2(a, a, [1, 0])
simd_shuffle2!(a, a, [1, 0])
}
/// Reversing vector elements (swap endianness)
@ -3943,7 +3943,7 @@ pub unsafe fn vrev64_s32(a: int32x2_t) -> int32x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_s32(a: int32x4_t) -> int32x4_t {
simd_shuffle4(a, a, [1, 0, 3, 2])
simd_shuffle4!(a, a, [1, 0, 3, 2])
}
/// Reversing vector elements (swap endianness)
@ -3953,7 +3953,7 @@ pub unsafe fn vrev64q_s32(a: int32x4_t) -> int32x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_u8(a: uint8x8_t) -> uint8x8_t {
simd_shuffle8(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
simd_shuffle8!(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
}
/// Reversing vector elements (swap endianness)
@ -3963,7 +3963,7 @@ pub unsafe fn vrev64_u8(a: uint8x8_t) -> uint8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_u8(a: uint8x16_t) -> uint8x16_t {
simd_shuffle16(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
simd_shuffle16!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
}
/// Reversing vector elements (swap endianness)
@ -3973,7 +3973,7 @@ pub unsafe fn vrev64q_u8(a: uint8x16_t) -> uint8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_u16(a: uint16x4_t) -> uint16x4_t {
simd_shuffle4(a, a, [3, 2, 1, 0])
simd_shuffle4!(a, a, [3, 2, 1, 0])
}
/// Reversing vector elements (swap endianness)
@ -3983,7 +3983,7 @@ pub unsafe fn vrev64_u16(a: uint16x4_t) -> uint16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_u16(a: uint16x8_t) -> uint16x8_t {
simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
}
/// Reversing vector elements (swap endianness)
@ -3993,7 +3993,7 @@ pub unsafe fn vrev64q_u16(a: uint16x8_t) -> uint16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_u32(a: uint32x2_t) -> uint32x2_t {
simd_shuffle2(a, a, [1, 0])
simd_shuffle2!(a, a, [1, 0])
}
/// Reversing vector elements (swap endianness)
@ -4003,7 +4003,7 @@ pub unsafe fn vrev64_u32(a: uint32x2_t) -> uint32x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_u32(a: uint32x4_t) -> uint32x4_t {
simd_shuffle4(a, a, [1, 0, 3, 2])
simd_shuffle4!(a, a, [1, 0, 3, 2])
}
/// Reversing vector elements (swap endianness)
@ -4013,7 +4013,7 @@ pub unsafe fn vrev64q_u32(a: uint32x4_t) -> uint32x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_f32(a: float32x2_t) -> float32x2_t {
simd_shuffle2(a, a, [1, 0])
simd_shuffle2!(a, a, [1, 0])
}
/// Reversing vector elements (swap endianness)
@ -4023,7 +4023,7 @@ pub unsafe fn vrev64_f32(a: float32x2_t) -> float32x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_f32(a: float32x4_t) -> float32x4_t {
simd_shuffle4(a, a, [1, 0, 3, 2])
simd_shuffle4!(a, a, [1, 0, 3, 2])
}
/// Reversing vector elements (swap endianness)
@ -4033,7 +4033,7 @@ pub unsafe fn vrev64q_f32(a: float32x4_t) -> float32x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_p8(a: poly8x8_t) -> poly8x8_t {
simd_shuffle8(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
simd_shuffle8!(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
}
/// Reversing vector elements (swap endianness)
@ -4043,7 +4043,7 @@ pub unsafe fn vrev64_p8(a: poly8x8_t) -> poly8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_p8(a: poly8x16_t) -> poly8x16_t {
simd_shuffle16(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
simd_shuffle16!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
}
/// Reversing vector elements (swap endianness)
@ -4053,7 +4053,7 @@ pub unsafe fn vrev64q_p8(a: poly8x16_t) -> poly8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_p16(a: poly16x4_t) -> poly16x4_t {
simd_shuffle4(a, a, [3, 2, 1, 0])
simd_shuffle4!(a, a, [3, 2, 1, 0])
}
/// Reversing vector elements (swap endianness)
@ -4063,7 +4063,7 @@ pub unsafe fn vrev64_p16(a: poly16x4_t) -> poly16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_p16(a: poly16x8_t) -> poly16x8_t {
simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
}
/// Signed Add and Accumulate Long Pairwise.

View file

@ -92,3 +92,99 @@ macro_rules! types {
pub struct $name($($fields)*);
)*)
}
#[allow(unused_macros)]
macro_rules! simd_shuffle2 {
($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
struct ConstParam<$(const $imm: $ty),+>;
impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
const IDX: [u32; 2] = $idx;
}
simd_shuffle2($x, $y, ConstParam::<$($imm),+>::IDX)
}};
($x:expr, $y:expr, $idx:expr $(,)?) => {{
const IDX: [u32; 2] = $idx;
simd_shuffle2($x, $y, IDX)
}};
}
#[allow(unused_macros)]
macro_rules! simd_shuffle4 {
($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
struct ConstParam<$(const $imm: $ty),+>;
impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
const IDX: [u32; 4] = $idx;
}
simd_shuffle4($x, $y, ConstParam::<$($imm),+>::IDX)
}};
($x:expr, $y:expr, $idx:expr $(,)?) => {{
const IDX: [u32; 4] = $idx;
simd_shuffle4($x, $y, IDX)
}};
}
#[allow(unused_macros)]
macro_rules! simd_shuffle8 {
($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
struct ConstParam<$(const $imm: $ty),+>;
impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
const IDX: [u32; 8] = $idx;
}
simd_shuffle8($x, $y, ConstParam::<$($imm),+>::IDX)
}};
($x:expr, $y:expr, $idx:expr $(,)?) => {{
const IDX: [u32; 8] = $idx;
simd_shuffle8($x, $y, IDX)
}};
}
#[allow(unused_macros)]
macro_rules! simd_shuffle16 {
($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
struct ConstParam<$(const $imm: $ty),+>;
impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
const IDX: [u32; 16] = $idx;
}
simd_shuffle16($x, $y, ConstParam::<$($imm),+>::IDX)
}};
($x:expr, $y:expr, $idx:expr $(,)?) => {{
const IDX: [u32; 16] = $idx;
simd_shuffle16($x, $y, IDX)
}};
}
#[allow(unused_macros)]
macro_rules! simd_shuffle32 {
($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
struct ConstParam<$(const $imm: $ty),+>;
impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
const IDX: [u32; 32] = $idx;
}
simd_shuffle32($x, $y, ConstParam::<$($imm),+>::IDX)
}};
($x:expr, $y:expr, $idx:expr $(,)?) => {{
const IDX: [u32; 32] = $idx;
simd_shuffle32($x, $y, IDX)
}};
}
#[allow(unused_macros)]
macro_rules! simd_shuffle64 {
($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
struct ConstParam<$(const $imm: $ty),+>;
impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
const IDX: [u32; 64] = $idx;
}
simd_shuffle64($x, $y, ConstParam::<$($imm),+>::IDX)
}};
($x:expr, $y:expr, $idx:expr $(,)?) => {{
const IDX: [u32; 64] = $idx;
simd_shuffle64($x, $y, IDX)
}};
}

View file

@ -47,10 +47,10 @@ mod sealed {
#[cfg_attr(all(test, target_endian = "big"), assert_instr(xxspltd, dm = 0x0))]
unsafe fn xxpermdi(a: i64x2, b: i64x2, dm: u8) -> i64x2 {
match dm & 0b11 {
0 => simd_shuffle2(a, b, [0b00, 0b10]),
1 => simd_shuffle2(a, b, [0b01, 0b10]),
2 => simd_shuffle2(a, b, [0b00, 0b11]),
_ => simd_shuffle2(a, b, [0b01, 0b11]),
0 => simd_shuffle2!(a, b, [0b00, 0b10]),
1 => simd_shuffle2!(a, b, [0b01, 0b10]),
2 => simd_shuffle2!(a, b, [0b00, 0b11]),
_ => simd_shuffle2!(a, b, [0b01, 0b11]),
}
}

View file

@ -118,10 +118,10 @@ pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
static_assert_imm8!(MASK);
simd_shuffle4(
simd_shuffle4!(
a,
b,
[
<const MASK: i32> [
MASK as u32 & 0b1,
((MASK as u32 >> 1) & 0b1) + 4,
((MASK as u32 >> 2) & 0b1) + 2,
@ -141,10 +141,10 @@ pub unsafe fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m2
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
static_assert_imm8!(MASK);
simd_shuffle8(
simd_shuffle8!(
a,
b,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11) + 8,
@ -463,10 +463,10 @@ pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
static_assert_imm4!(IMM4);
simd_shuffle4(
simd_shuffle4!(
a,
b,
[
<const IMM4: i32> [
((IMM4 as u32 >> 0) & 1) * 4 + 0,
((IMM4 as u32 >> 1) & 1) * 4 + 1,
((IMM4 as u32 >> 2) & 1) * 4 + 2,
@ -486,10 +486,10 @@ pub unsafe fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
static_assert_imm8!(IMM8);
simd_shuffle8(
simd_shuffle8!(
a,
b,
[
<const IMM8: i32> [
((IMM8 as u32 >> 0) & 1) * 8 + 0,
((IMM8 as u32 >> 1) & 1) * 8 + 1,
((IMM8 as u32 >> 2) & 1) * 8 + 2,
@ -930,10 +930,10 @@ pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
static_assert_imm1!(IMM1);
simd_shuffle4(
simd_shuffle4!(
a,
_mm256_undefined_ps(),
[[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
<const IMM1: i32> [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
)
}
@ -951,7 +951,7 @@ pub unsafe fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
static_assert_imm1!(IMM1);
simd_shuffle2(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize])
simd_shuffle2!(a, _mm256_undefined_pd(), <const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize])
}
/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
@ -967,10 +967,10 @@ pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
static_assert_imm1!(IMM1);
let dst: i64x2 = simd_shuffle2(
let dst: i64x2 = simd_shuffle2!(
a.as_i64x4(),
_mm256_undefined_si256().as_i64x4(),
[[0, 1], [2, 3]][IMM1 as usize],
<const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize],
);
transmute(dst)
}
@ -1033,10 +1033,10 @@ pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
static_assert_imm8!(IMM8);
simd_shuffle8(
simd_shuffle8!(
a,
_mm256_undefined_ps(),
[
<const IMM8: i32> [
(IMM8 as u32 >> 0) & 0b11,
(IMM8 as u32 >> 2) & 0b11,
(IMM8 as u32 >> 4) & 0b11,
@ -1060,10 +1060,10 @@ pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
static_assert_imm8!(IMM8);
simd_shuffle4(
simd_shuffle4!(
a,
_mm_undefined_ps(),
[
<const IMM8: i32> [
(IMM8 as u32 >> 0) & 0b11,
(IMM8 as u32 >> 2) & 0b11,
(IMM8 as u32 >> 4) & 0b11,
@ -1107,10 +1107,10 @@ pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
static_assert_imm4!(IMM4);
simd_shuffle4(
simd_shuffle4!(
a,
_mm256_undefined_pd(),
[
<const IMM4: i32> [
((IMM4 as u32 >> 0) & 1),
((IMM4 as u32 >> 1) & 1),
((IMM4 as u32 >> 2) & 1) + 2,
@ -1130,10 +1130,10 @@ pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
static_assert_imm2!(IMM2);
simd_shuffle2(
simd_shuffle2!(
a,
_mm_undefined_pd(),
[(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
<const IMM2: i32> [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
)
}
@ -1257,10 +1257,10 @@ pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
static_assert_imm1!(IMM1);
simd_shuffle8(
simd_shuffle8!(
a,
_mm256_castps128_ps256(b),
[[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
<const IMM1: i32> [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
)
}
@ -1279,10 +1279,10 @@ pub unsafe fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
static_assert_imm1!(IMM1);
simd_shuffle4(
simd_shuffle4!(
a,
_mm256_castpd128_pd256(b),
[[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
<const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
)
}
@ -1300,10 +1300,10 @@ pub unsafe fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> _
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
static_assert_imm1!(IMM1);
let dst: i64x4 = simd_shuffle4(
let dst: i64x4 = simd_shuffle4!(
a.as_i64x4(),
_mm256_castsi128_si256(b).as_i64x4(),
[[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
<const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
);
transmute(dst)
}
@ -1639,7 +1639,7 @@ pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
#[cfg_attr(test, assert_instr(vmovshdup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 {
simd_shuffle8(a, a, [1, 1, 3, 3, 5, 5, 7, 7])
simd_shuffle8!(a, a, [1, 1, 3, 3, 5, 5, 7, 7])
}
/// Duplicate even-indexed single-precision (32-bit) floating-point elements
@ -1651,7 +1651,7 @@ pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vmovsldup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 {
simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6])
simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6])
}
/// Duplicate even-indexed double-precision (64-bit) floating-point elements
@ -1663,7 +1663,7 @@ pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d {
simd_shuffle4(a, a, [0, 0, 2, 2])
simd_shuffle4!(a, a, [0, 0, 2, 2])
}
/// Loads 256-bits of integer data from unaligned memory into result.
@ -1756,7 +1756,7 @@ pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vunpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
simd_shuffle4(a, b, [1, 5, 3, 7])
simd_shuffle4!(a, b, [1, 5, 3, 7])
}
/// Unpacks and interleave single-precision (32-bit) floating-point elements
@ -1768,7 +1768,7 @@ pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vunpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
simd_shuffle8(a, b, [2, 10, 3, 11, 6, 14, 7, 15])
simd_shuffle8!(a, b, [2, 10, 3, 11, 6, 14, 7, 15])
}
/// Unpacks and interleave double-precision (64-bit) floating-point elements
@ -1780,7 +1780,7 @@ pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vunpcklpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
simd_shuffle4(a, b, [0, 4, 2, 6])
simd_shuffle4!(a, b, [0, 4, 2, 6])
}
/// Unpacks and interleave single-precision (32-bit) floating-point elements
@ -1792,7 +1792,7 @@ pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vunpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
simd_shuffle8(a, b, [0, 8, 1, 9, 4, 12, 5, 13])
simd_shuffle8!(a, b, [0, 8, 1, 9, 4, 12, 5, 13])
}
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
@ -2572,7 +2572,7 @@ pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 {
simd_shuffle4(a, a, [0, 1, 2, 3])
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
/// Casts vector of type __m256d to type __m128d.
@ -2584,7 +2584,7 @@ pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 {
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
simd_shuffle2(a, a, [0, 1])
simd_shuffle2!(a, a, [0, 1])
}
/// Casts vector of type __m256i to type __m128i.
@ -2597,7 +2597,7 @@ pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
let a = a.as_i64x4();
let dst: i64x2 = simd_shuffle2(a, a, [0, 1]);
let dst: i64x2 = simd_shuffle2!(a, a, [0, 1]);
transmute(dst)
}
@ -2611,8 +2611,8 @@ pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
// FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1])
simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0])
// FIXME simd_shuffle8!(a, a, [0, 1, 2, 3, -1, -1, -1, -1])
simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 0, 0, 0])
}
/// Casts vector of type __m128d to type __m256d;
@ -2625,8 +2625,8 @@ pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
// FIXME simd_shuffle4(a, a, [0, 1, -1, -1])
simd_shuffle4(a, a, [0, 1, 0, 0])
// FIXME simd_shuffle4!(a, a, [0, 1, -1, -1])
simd_shuffle4!(a, a, [0, 1, 0, 0])
}
/// Casts vector of type __m128i to type __m256i;
@ -2640,8 +2640,8 @@ pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
let a = a.as_i64x2();
// FIXME simd_shuffle4(a, a, [0, 1, -1, -1])
let dst: i64x4 = simd_shuffle4(a, a, [0, 1, 0, 0]);
// FIXME simd_shuffle4!(a, a, [0, 1, -1, -1])
let dst: i64x4 = simd_shuffle4!(a, a, [0, 1, 0, 0]);
transmute(dst)
}
@ -2656,7 +2656,7 @@ pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
simd_shuffle8(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Constructs a 256-bit integer vector from a 128-bit integer vector.
@ -2671,7 +2671,7 @@ pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
let b = _mm_setzero_si128().as_i64x2();
let dst: i64x4 = simd_shuffle4(a.as_i64x2(), b, [0, 1, 2, 3]);
let dst: i64x4 = simd_shuffle4!(a.as_i64x2(), b, [0, 1, 2, 3]);
transmute(dst)
}
@ -2687,7 +2687,7 @@ pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
    // Zero-extend: lanes 0..1 come from `a`, lanes 2..3 from the zero vector.
    simd_shuffle4!(a, _mm_setzero_pd(), [0, 1, 2, 3])
}
/// Returns vector of type `__m256` with undefined elements.
@ -2732,7 +2732,7 @@ pub unsafe fn _mm256_undefined_si256() -> __m256i {
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
    // Concatenate: low 128 bits from `lo`, high 128 bits from `hi`.
    simd_shuffle8!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Sets packed __m256d returned vector with the supplied values.

View file

@ -175,7 +175,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
let b = b.as_i8x32();
let r: i8x32 = match IMM8 % 16 {
0 => simd_shuffle32(
0 => simd_shuffle32!(
b,
a,
[
@ -183,7 +183,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
23, 24, 25, 26, 27, 28, 29, 30, 31,
],
),
1 => simd_shuffle32(
1 => simd_shuffle32!(
b,
a,
[
@ -191,7 +191,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
24, 25, 26, 27, 28, 29, 30, 31, 48,
],
),
2 => simd_shuffle32(
2 => simd_shuffle32!(
b,
a,
[
@ -199,7 +199,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
25, 26, 27, 28, 29, 30, 31, 48, 49,
],
),
3 => simd_shuffle32(
3 => simd_shuffle32!(
b,
a,
[
@ -207,7 +207,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
],
),
4 => simd_shuffle32(
4 => simd_shuffle32!(
b,
a,
[
@ -215,7 +215,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
],
),
5 => simd_shuffle32(
5 => simd_shuffle32!(
b,
a,
[
@ -223,7 +223,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
],
),
6 => simd_shuffle32(
6 => simd_shuffle32!(
b,
a,
[
@ -231,7 +231,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
],
),
7 => simd_shuffle32(
7 => simd_shuffle32!(
b,
a,
[
@ -239,7 +239,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
],
),
8 => simd_shuffle32(
8 => simd_shuffle32!(
b,
a,
[
@ -247,7 +247,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
],
),
9 => simd_shuffle32(
9 => simd_shuffle32!(
b,
a,
[
@ -255,7 +255,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
],
),
10 => simd_shuffle32(
10 => simd_shuffle32!(
b,
a,
[
@ -263,7 +263,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
],
),
11 => simd_shuffle32(
11 => simd_shuffle32!(
b,
a,
[
@ -271,7 +271,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
],
),
12 => simd_shuffle32(
12 => simd_shuffle32!(
b,
a,
[
@ -279,7 +279,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
],
),
13 => simd_shuffle32(
13 => simd_shuffle32!(
b,
a,
[
@ -287,7 +287,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
],
),
14 => simd_shuffle32(
14 => simd_shuffle32!(
b,
a,
[
@ -295,7 +295,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
],
),
15 => simd_shuffle32(
15 => simd_shuffle32!(
b,
a,
[
@ -370,10 +370,10 @@ pub unsafe fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128
static_assert_imm4!(IMM4);
let a = a.as_i32x4();
let b = b.as_i32x4();
let r: i32x4 = simd_shuffle4(
let r: i32x4 = simd_shuffle4!(
a,
b,
[
<const IMM4: i32> [
[0, 4, 0, 4][IMM4 as usize & 0b11],
[1, 1, 5, 5][IMM4 as usize & 0b11],
[2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
@ -395,10 +395,10 @@ pub unsafe fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
static_assert_imm8!(IMM8);
let a = a.as_i32x8();
let b = b.as_i32x8();
let r: i32x8 = simd_shuffle8(
let r: i32x8 = simd_shuffle8!(
a,
b,
[
<const IMM8: i32> [
[0, 8, 0, 8][IMM8 as usize & 0b11],
[1, 1, 9, 9][IMM8 as usize & 0b11],
[2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
@ -424,10 +424,11 @@ pub unsafe fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
static_assert_imm8!(IMM8);
let a = a.as_i16x16();
let b = b.as_i16x16();
let r: i16x16 = simd_shuffle16(
let r: i16x16 = simd_shuffle16!(
a,
b,
[
<const IMM8: i32> [
[0, 16, 0, 16][IMM8 as usize & 0b11],
[1, 1, 17, 17][IMM8 as usize & 0b11],
[2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
@ -470,7 +471,7 @@ pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m25
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
    let zero = _mm_setzero_si128();
    // All indices 0: replicate the lowest byte of `a` into every lane.
    let ret = simd_shuffle16!(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
    transmute::<i8x16, _>(ret)
}
@ -484,7 +485,7 @@ pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
    let zero = _mm_setzero_si128();
    // All indices 0: replicate the lowest byte of `a` into all 32 lanes.
    let ret = simd_shuffle32!(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
    transmute::<i8x32, _>(ret)
}
@ -500,7 +501,7 @@ pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
    let zero = _mm_setzero_si128();
    // Replicate the lowest 32-bit element of `a` into every lane.
    let ret = simd_shuffle4!(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
    transmute::<i32x4, _>(ret)
}
@ -516,7 +517,7 @@ pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
    let zero = _mm_setzero_si128();
    // Replicate the lowest 32-bit element of `a` into all 8 lanes.
    let ret = simd_shuffle8!(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
    transmute::<i32x8, _>(ret)
}
@ -530,7 +531,7 @@ pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
    // Replicate the lowest 64-bit element of `a` into both lanes.
    let ret = simd_shuffle2!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
    transmute::<i64x2, _>(ret)
}
@ -543,7 +544,7 @@ pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
    // Replicate the lowest 64-bit element of `a` into all 4 lanes.
    let ret = simd_shuffle4!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
    transmute::<i64x4, _>(ret)
}
@ -556,7 +557,7 @@ pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
    // Replicate the low double of `a`; the second operand is never selected.
    simd_shuffle2!(a, _mm_setzero_pd(), [0_u32; 2])
}
/// Broadcasts the low double-precision (64-bit) floating-point element
@ -568,7 +569,7 @@ pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
    // Replicate the low double of `a` into all 4 lanes of the 256-bit result.
    simd_shuffle4!(a, _mm_setzero_pd(), [0_u32; 4])
}
// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
@ -582,7 +583,7 @@ pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
    let zero = _mm_setzero_si128();
    // Repeat both 64-bit lanes of `a` to fill the 256-bit result.
    let ret = simd_shuffle4!(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
    transmute::<i64x4, _>(ret)
}
@ -595,7 +596,7 @@ pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
    // Replicate the low single of `a`; the zero operand is never selected.
    simd_shuffle4!(a, _mm_setzero_ps(), [0_u32; 4])
}
/// Broadcasts the low single-precision (32-bit) floating-point element
@ -607,7 +608,7 @@ pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
    // Replicate the low single of `a` into all 8 lanes of the 256-bit result.
    simd_shuffle8!(a, _mm_setzero_ps(), [0_u32; 8])
}
/// Broadcasts the low packed 16-bit integer from a to all elements of
@ -620,7 +621,7 @@ pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
    let zero = _mm_setzero_si128();
    // Replicate the lowest 16-bit element of `a` into every lane.
    let ret = simd_shuffle8!(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
    transmute::<i16x8, _>(ret)
}
@ -634,7 +635,7 @@ pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
    let zero = _mm_setzero_si128();
    // Replicate the lowest 16-bit element of `a` into all 16 lanes.
    let ret = simd_shuffle16!(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
    transmute::<i16x16, _>(ret)
}
@ -746,7 +747,7 @@ pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
    let a = a.as_i16x8();
    // Take the 4 low 16-bit elements, then sign-extend each to 64 bits.
    let v64: i16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute::<i64x4, _>(simd_cast(v64))
}
@ -781,7 +782,7 @@ pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
    let a = a.as_i8x16();
    // Take the 8 low bytes, then sign-extend each to 32 bits.
    let v64: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute::<i32x8, _>(simd_cast(v64))
}
@ -794,7 +795,7 @@ pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
    let a = a.as_i8x16();
    // Take the 4 low bytes, then sign-extend each to 64 bits.
    let v32: i8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute::<i64x4, _>(simd_cast(v32))
}
@ -820,7 +821,7 @@ pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
    let a = a.as_u16x8();
    // Take the 4 low 16-bit elements, then zero-extend each to 64 bits
    // (zero-extension because the source lanes are unsigned).
    let v64: u16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute::<i64x4, _>(simd_cast(v64))
}
@ -856,7 +857,7 @@ pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
    let a = a.as_u8x16();
    // Take the 8 low bytes, then zero-extend each to 32 bits.
    let v64: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute::<i32x8, _>(simd_cast(v64))
}
@ -870,7 +871,7 @@ pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
    let a = a.as_u8x16();
    // Take the 4 low bytes, then zero-extend each to 64 bits.
    let v32: u8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute::<i64x4, _>(simd_cast(v32))
}
@ -889,7 +890,7 @@ pub unsafe fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
static_assert_imm1!(IMM1);
let a = a.as_i64x4();
let b = _mm256_undefined_si256().as_i64x4();
let dst: i64x2 = simd_shuffle2(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
let dst: i64x2 = simd_shuffle2!(a, b, <const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize]);
transmute(dst)
}
@ -1711,7 +1712,8 @@ pub unsafe fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -
static_assert_imm1!(IMM1);
let a = a.as_i64x4();
let b = _mm256_castsi128_si256(b).as_i64x4();
let dst: i64x4 = simd_shuffle4(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
let dst: i64x4 =
simd_shuffle4!(a, b, <const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
transmute(dst)
}
@ -2200,10 +2202,10 @@ pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
pub unsafe fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(IMM8);
let zero = _mm256_setzero_si256().as_i64x4();
let r: i64x4 = simd_shuffle4(
let r: i64x4 = simd_shuffle4!(
a.as_i64x4(),
zero,
[
<const IMM8: i32> [
IMM8 as u32 & 0b11,
(IMM8 as u32 >> 2) & 0b11,
(IMM8 as u32 >> 4) & 0b11,
@ -2237,10 +2239,10 @@ pub unsafe fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
static_assert_imm8!(IMM8);
simd_shuffle4(
simd_shuffle4!(
a,
_mm256_undefined_pd(),
[
<const IMM8: i32> [
IMM8 as u32 & 0b11,
(IMM8 as u32 >> 2) & 0b11,
(IMM8 as u32 >> 4) & 0b11,
@ -2350,10 +2352,10 @@ pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(MASK);
let r: i32x8 = simd_shuffle8(
let r: i32x8 = simd_shuffle8!(
a.as_i32x8(),
a.as_i32x8(),
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
(MASK as u32 >> 4) & 0b11,
@ -2380,10 +2382,10 @@ pub unsafe fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
pub unsafe fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(IMM8);
let a = a.as_i16x16();
let r: i16x16 = simd_shuffle16(
let r: i16x16 = simd_shuffle16!(
a,
a,
[
<const IMM8: i32> [
0,
1,
2,
@ -2418,10 +2420,10 @@ pub unsafe fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
pub unsafe fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(IMM8);
let a = a.as_i16x16();
let r: i16x16 = simd_shuffle16(
let r: i16x16 = simd_shuffle16!(
a,
a,
[
<const IMM8: i32> [
0 + (IMM8 as u32 & 0b11),
0 + ((IMM8 as u32 >> 2) & 0b11),
0 + ((IMM8 as u32 >> 4) & 0b11),
@ -2585,10 +2587,10 @@ pub unsafe fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(IMM8);
let a = a.as_i8x32();
let zero = _mm256_setzero_si256().as_i8x32();
let r: i8x32 = simd_shuffle32(
let r: i8x32 = simd_shuffle32!(
zero,
a,
[
<const IMM8: i32> [
32 - (IMM8 as u32 & 0xff),
33 - (IMM8 as u32 & 0xff),
34 - (IMM8 as u32 & 0xff),
@ -2780,7 +2782,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
let a = a.as_i8x32();
let zero = _mm256_setzero_si256().as_i8x32();
let r: i8x32 = match IMM8 % 16 {
0 => simd_shuffle32(
0 => simd_shuffle32!(
a,
zero,
[
@ -2788,7 +2790,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
23, 24, 25, 26, 27, 28, 29, 30, 31,
],
),
1 => simd_shuffle32(
1 => simd_shuffle32!(
a,
zero,
[
@ -2796,7 +2798,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
24, 25, 26, 27, 28, 29, 30, 31, 32,
],
),
2 => simd_shuffle32(
2 => simd_shuffle32!(
a,
zero,
[
@ -2804,7 +2806,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
25, 26, 27, 28, 29, 30, 31, 32, 32,
],
),
3 => simd_shuffle32(
3 => simd_shuffle32!(
a,
zero,
[
@ -2812,7 +2814,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
25, 26, 27, 28, 29, 30, 31, 32, 32, 32,
],
),
4 => simd_shuffle32(
4 => simd_shuffle32!(
a,
zero,
[
@ -2820,7 +2822,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
26, 27, 28, 29, 30, 31, 32, 32, 32, 32,
],
),
5 => simd_shuffle32(
5 => simd_shuffle32!(
a,
zero,
[
@ -2828,7 +2830,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
27, 28, 29, 30, 31, 32, 32, 32, 32, 32,
],
),
6 => simd_shuffle32(
6 => simd_shuffle32!(
a,
zero,
[
@ -2836,7 +2838,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
28, 29, 30, 31, 32, 32, 32, 32, 32, 32,
],
),
7 => simd_shuffle32(
7 => simd_shuffle32!(
a,
zero,
[
@ -2844,7 +2846,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32,
],
),
8 => simd_shuffle32(
8 => simd_shuffle32!(
a,
zero,
[
@ -2852,7 +2854,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32,
],
),
9 => simd_shuffle32(
9 => simd_shuffle32!(
a,
zero,
[
@ -2860,7 +2862,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
],
),
10 => simd_shuffle32(
10 => simd_shuffle32!(
a,
zero,
[
@ -2868,7 +2870,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
],
),
11 => simd_shuffle32(
11 => simd_shuffle32!(
a,
zero,
[
@ -2876,7 +2878,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
],
),
12 => simd_shuffle32(
12 => simd_shuffle32!(
a,
zero,
[
@ -2884,7 +2886,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
],
),
13 => simd_shuffle32(
13 => simd_shuffle32!(
a,
zero,
[
@ -2892,7 +2894,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
],
),
14 => simd_shuffle32(
14 => simd_shuffle32!(
a,
zero,
[
@ -2900,7 +2902,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
],
),
15 => simd_shuffle32(
15 => simd_shuffle32!(
a,
zero,
[
@ -3178,7 +3180,7 @@ pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
#[rustfmt::skip]
let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
let r: i8x32 = simd_shuffle32!(a.as_i8x32(), b.as_i8x32(), [
8, 40, 9, 41, 10, 42, 11, 43,
12, 44, 13, 45, 14, 46, 15, 47,
24, 56, 25, 57, 26, 58, 27, 59,
@ -3231,7 +3233,7 @@ pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
#[rustfmt::skip]
let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
let r: i8x32 = simd_shuffle32!(a.as_i8x32(), b.as_i8x32(), [
0, 32, 1, 33, 2, 34, 3, 35,
4, 36, 5, 37, 6, 38, 7, 39,
16, 48, 17, 49, 18, 50, 19, 51,
@ -3279,7 +3281,7 @@ pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpunpckhwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
let r: i16x16 = simd_shuffle16(
let r: i16x16 = simd_shuffle16!(
a.as_i16x16(),
b.as_i16x16(),
[4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
@ -3327,7 +3329,7 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpunpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
let r: i16x16 = simd_shuffle16(
let r: i16x16 = simd_shuffle16!(
a.as_i16x16(),
b.as_i16x16(),
[0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
@ -3368,7 +3370,7 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vunpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Interleave the high 32-bit elements of each 128-bit half of `a` and `b`
    // (indices >= 8 select from `b`).
    let r: i32x8 = simd_shuffle8!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
    transmute(r)
}
@ -3405,7 +3407,7 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vunpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Interleave the low 32-bit elements of each 128-bit half of `a` and `b`
    // (indices >= 8 select from `b`).
    let r: i32x8 = simd_shuffle8!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
    transmute(r)
}
@ -3442,7 +3444,7 @@ pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vunpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
    // Interleave the high 64-bit element of each 128-bit half of `a` and `b`
    // (indices >= 4 select from `b`).
    let r: i64x4 = simd_shuffle4!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
    transmute(r)
}
@ -3479,7 +3481,7 @@ pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vunpcklpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
    // Interleave the low 64-bit element of each 128-bit half of `a` and `b`
    // (indices >= 4 select from `b`).
    let r: i64x4 = simd_shuffle4!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
    transmute(r)
}

View file

@ -6218,7 +6218,7 @@ pub unsafe fn _mm_mask_blend_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m12
#[cfg_attr(test, assert_instr(vpbroadcastw))]
pub unsafe fn _mm512_broadcastw_epi16(a: __m128i) -> __m512i {
let a = _mm512_castsi128_si512(a).as_i16x32();
let ret: i16x32 = simd_shuffle32(
let ret: i16x32 = simd_shuffle32!(
a,
a,
[
@ -6306,7 +6306,7 @@ pub unsafe fn _mm_maskz_broadcastw_epi16(k: __mmask8, a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vpbroadcastb))]
pub unsafe fn _mm512_broadcastb_epi8(a: __m128i) -> __m512i {
let a = _mm512_castsi128_si512(a).as_i8x64();
let ret: i8x64 = simd_shuffle64(
let ret: i8x64 = simd_shuffle64!(
a,
a,
[
@ -6397,7 +6397,7 @@ pub unsafe fn _mm512_unpackhi_epi16(a: __m512i, b: __m512i) -> __m512i {
let a = a.as_i16x32();
let b = b.as_i16x32();
#[rustfmt::skip]
let r: i16x32 = simd_shuffle32(
let r: i16x32 = simd_shuffle32!(
a,
b,
[
@ -6508,7 +6508,7 @@ pub unsafe fn _mm512_unpackhi_epi8(a: __m512i, b: __m512i) -> __m512i {
let a = a.as_i8x64();
let b = b.as_i8x64();
#[rustfmt::skip]
let r: i8x64 = simd_shuffle64(
let r: i8x64 = simd_shuffle64!(
a,
b,
[
@ -6627,7 +6627,7 @@ pub unsafe fn _mm512_unpacklo_epi16(a: __m512i, b: __m512i) -> __m512i {
let a = a.as_i16x32();
let b = b.as_i16x32();
#[rustfmt::skip]
let r: i16x32 = simd_shuffle32(
let r: i16x32 = simd_shuffle32!(
a,
b,
[
@ -6738,7 +6738,7 @@ pub unsafe fn _mm512_unpacklo_epi8(a: __m512i, b: __m512i) -> __m512i {
let a = a.as_i8x64();
let b = b.as_i8x64();
#[rustfmt::skip]
let r: i8x64 = simd_shuffle64(
let r: i8x64 = simd_shuffle64!(
a,
b,
[
@ -7133,10 +7133,10 @@ pub unsafe fn _mm_maskz_set1_epi8(k: __mmask16, a: i8) -> __m128i {
pub unsafe fn _mm512_shufflelo_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
static_assert_imm8!(IMM8);
let a = a.as_i16x32();
let r: i16x32 = simd_shuffle32(
let r: i16x32 = simd_shuffle32!(
a,
a,
[
<const IMM8: i32> [
IMM8 as u32 & 0b11,
(IMM8 as u32 >> 2) & 0b11,
(IMM8 as u32 >> 4) & 0b11,
@ -7277,10 +7277,10 @@ pub unsafe fn _mm_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask8, a: __m128i
pub unsafe fn _mm512_shufflehi_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
static_assert_imm8!(IMM8);
let a = a.as_i16x32();
let r: i16x32 = simd_shuffle32(
let r: i16x32 = simd_shuffle32!(
a,
a,
[
<const IMM8: i32> [
0,
1,
2,
@ -8433,7 +8433,7 @@ pub unsafe fn _mm256_maskz_cvtepi16_epi8(k: __mmask16, a: __m256i) -> __m128i {
pub unsafe fn _mm_cvtepi16_epi8(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let zero = _mm_setzero_si128().as_i16x8();
    // Widen to 16 lanes (upper 8 lanes are zeros from `zero`), then truncate
    // each 16-bit element to 8 bits via simd_cast.
    let v256: i16x16 = simd_shuffle16!(a, zero, [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]);
    transmute::<i8x16, _>(simd_cast(v256))
}
@ -8875,10 +8875,10 @@ pub unsafe fn _mm512_bslli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
static_assert_imm8!(IMM8);
let a = a.as_i8x64();
let zero = _mm512_setzero_si512().as_i8x64();
let r: i8x64 = simd_shuffle64(
let r: i8x64 = simd_shuffle64!(
zero,
a,
[
<const IMM8: i32> [
64 - (IMM8 as u32 & 0xff),
65 - (IMM8 as u32 & 0xff),
66 - (IMM8 as u32 & 0xff),
@ -8960,7 +8960,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
let a = a.as_i8x64();
let zero = _mm512_setzero_si512().as_i8x64();
let r: i8x64 = match IMM8 % 16 {
0 => simd_shuffle64(
0 => simd_shuffle64!(
a,
zero,
[
@ -8969,7 +8969,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
],
),
1 => simd_shuffle64(
1 => simd_shuffle64!(
a,
zero,
[
@ -8978,7 +8978,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112,
],
),
2 => simd_shuffle64(
2 => simd_shuffle64!(
a,
zero,
[
@ -8987,7 +8987,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113,
],
),
3 => simd_shuffle64(
3 => simd_shuffle64!(
a,
zero,
[
@ -8997,7 +8997,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
114,
],
),
4 => simd_shuffle64(
4 => simd_shuffle64!(
a,
zero,
[
@ -9007,7 +9007,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
115,
],
),
5 => simd_shuffle64(
5 => simd_shuffle64!(
a,
zero,
[
@ -9017,7 +9017,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
115, 116,
],
),
6 => simd_shuffle64(
6 => simd_shuffle64!(
a,
zero,
[
@ -9027,7 +9027,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
116, 117,
],
),
7 => simd_shuffle64(
7 => simd_shuffle64!(
a,
zero,
[
@ -9037,7 +9037,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
116, 117, 118,
],
),
8 => simd_shuffle64(
8 => simd_shuffle64!(
a,
zero,
[
@ -9047,7 +9047,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
116, 117, 118, 119,
],
),
9 => simd_shuffle64(
9 => simd_shuffle64!(
a,
zero,
[
@ -9057,7 +9057,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
117, 118, 119, 120,
],
),
10 => simd_shuffle64(
10 => simd_shuffle64!(
a,
zero,
[
@ -9067,7 +9067,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
118, 119, 120, 121,
],
),
11 => simd_shuffle64(
11 => simd_shuffle64!(
a,
zero,
[
@ -9077,7 +9077,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
117, 118, 119, 120, 121, 122,
],
),
12 => simd_shuffle64(
12 => simd_shuffle64!(
a,
zero,
[
@ -9087,7 +9087,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
118, 119, 120, 121, 122, 123,
],
),
13 => simd_shuffle64(
13 => simd_shuffle64!(
a,
zero,
[
@ -9097,7 +9097,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
119, 120, 121, 122, 123, 124,
],
),
14 => simd_shuffle64(
14 => simd_shuffle64!(
a,
zero,
[
@ -9107,7 +9107,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
120, 121, 122, 123, 124, 125,
],
),
15 => simd_shuffle64(
15 => simd_shuffle64!(
a,
zero,
[
@ -9146,7 +9146,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
let b = b.as_i8x64();
let r: i8x64 = match IMM8 % 16 {
0 => simd_shuffle64(
0 => simd_shuffle64!(
b,
a,
[
@ -9155,7 +9155,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
],
),
1 => simd_shuffle64(
1 => simd_shuffle64!(
b,
a,
[
@ -9164,7 +9164,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112,
],
),
2 => simd_shuffle64(
2 => simd_shuffle64!(
b,
a,
[
@ -9173,7 +9173,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113,
],
),
3 => simd_shuffle64(
3 => simd_shuffle64!(
b,
a,
[
@ -9183,7 +9183,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
114,
],
),
4 => simd_shuffle64(
4 => simd_shuffle64!(
b,
a,
[
@ -9193,7 +9193,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
115,
],
),
5 => simd_shuffle64(
5 => simd_shuffle64!(
b,
a,
[
@ -9203,7 +9203,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
115, 116,
],
),
6 => simd_shuffle64(
6 => simd_shuffle64!(
b,
a,
[
@ -9213,7 +9213,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
116, 117,
],
),
7 => simd_shuffle64(
7 => simd_shuffle64!(
b,
a,
[
@ -9223,7 +9223,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
116, 117, 118,
],
),
8 => simd_shuffle64(
8 => simd_shuffle64!(
b,
a,
[
@ -9233,7 +9233,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
116, 117, 118, 119,
],
),
9 => simd_shuffle64(
9 => simd_shuffle64!(
b,
a,
[
@ -9243,7 +9243,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
117, 118, 119, 120,
],
),
10 => simd_shuffle64(
10 => simd_shuffle64!(
b,
a,
[
@ -9253,7 +9253,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
118, 119, 120, 121,
],
),
11 => simd_shuffle64(
11 => simd_shuffle64!(
b,
a,
[
@ -9263,7 +9263,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
117, 118, 119, 120, 121, 122,
],
),
12 => simd_shuffle64(
12 => simd_shuffle64!(
b,
a,
[
@ -9273,7 +9273,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
118, 119, 120, 121, 122, 123,
],
),
13 => simd_shuffle64(
13 => simd_shuffle64!(
b,
a,
[
@ -9283,7 +9283,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
119, 120, 121, 122, 123, 124,
],
),
14 => simd_shuffle64(
14 => simd_shuffle64!(
b,
a,
[
@ -9293,7 +9293,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
120, 121, 122, 123, 124, 125,
],
),
15 => simd_shuffle64(
15 => simd_shuffle64!(
b,
a,
[

View file

@ -10529,7 +10529,7 @@ pub unsafe fn _mm512_cvtpd_pslo(v2: __m512d) -> __m512 {
0b11111111,
_MM_FROUND_CUR_DIRECTION,
);
simd_shuffle16(
simd_shuffle16!(
r,
_mm256_setzero_ps().as_f32x8(),
[0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
@ -10549,7 +10549,7 @@ pub unsafe fn _mm512_mask_cvtpd_pslo(src: __m512, k: __mmask8, v2: __m512d) -> _
k,
_MM_FROUND_CUR_DIRECTION,
);
simd_shuffle16(
simd_shuffle16!(
r,
_mm256_setzero_ps().as_f32x8(),
[0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
@ -10644,7 +10644,7 @@ pub unsafe fn _mm_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vpmovsxbq))]
pub unsafe fn _mm512_cvtepi8_epi64(a: __m128i) -> __m512i {
let a = a.as_i8x16();
let v64: i8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
let v64: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
transmute::<i64x8, _>(simd_cast(v64))
}
@ -10805,7 +10805,7 @@ pub unsafe fn _mm_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vpmovzxbq))]
pub unsafe fn _mm512_cvtepu8_epi64(a: __m128i) -> __m512i {
let a = a.as_u8x16();
let v64: u8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
let v64: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
transmute::<i64x8, _>(simd_cast(v64))
}
@ -11628,7 +11628,7 @@ pub unsafe fn _mm256_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m256d {
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm_cvtepu32_pd(a: __m128i) -> __m128d {
let a = a.as_u32x4();
let u64: u32x2 = simd_shuffle2(a, a, [0, 1]);
let u64: u32x2 = simd_shuffle2!(a, a, [0, 1]);
transmute::<f64x2, _>(simd_cast(u64))
}
@ -11663,7 +11663,7 @@ pub unsafe fn _mm_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m128d {
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
pub unsafe fn _mm512_cvtepi32lo_pd(v2: __m512i) -> __m512d {
let v2 = v2.as_i32x16();
let v256: i32x8 = simd_shuffle8(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
let v256: i32x8 = simd_shuffle8!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
transmute::<f64x8, _>(simd_cast(v256))
}
@ -11686,7 +11686,7 @@ pub unsafe fn _mm512_mask_cvtepi32lo_pd(src: __m512d, k: __mmask8, v2: __m512i)
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm512_cvtepu32lo_pd(v2: __m512i) -> __m512d {
let v2 = v2.as_u32x16();
let v256: u32x8 = simd_shuffle8(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
let v256: u32x8 = simd_shuffle8!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
transmute::<f64x8, _>(simd_cast(v256))
}
@ -19215,10 +19215,10 @@ pub unsafe fn _mm_maskz_srlv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> _
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_permute_ps<const MASK: i32>(a: __m512) -> __m512 {
static_assert_imm8!(MASK);
simd_shuffle16(
simd_shuffle16!(
a,
a,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11),
@ -19333,10 +19333,10 @@ pub unsafe fn _mm_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m128) -> _
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_permute_pd<const MASK: i32>(a: __m512d) -> __m512d {
static_assert_imm8!(MASK);
simd_shuffle8(
simd_shuffle8!(
a,
a,
[
<const MASK: i32> [
MASK as u32 & 0b1,
((MASK as u32 >> 1) & 0b1),
((MASK as u32 >> 2) & 0b1) + 2,
@ -19451,10 +19451,10 @@ pub unsafe fn _mm_maskz_permute_pd<const IMM2: i32>(k: __mmask8, a: __m128d) ->
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_permutex_epi64<const MASK: i32>(a: __m512i) -> __m512i {
static_assert_imm8!(MASK);
simd_shuffle8(
simd_shuffle8!(
a,
a,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11),
@ -19507,10 +19507,10 @@ pub unsafe fn _mm512_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m51
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_permutex_epi64<const MASK: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(MASK);
simd_shuffle4(
simd_shuffle4!(
a,
a,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11),
@ -19559,10 +19559,10 @@ pub unsafe fn _mm256_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m25
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_permutex_pd<const MASK: i32>(a: __m512d) -> __m512d {
static_assert_imm8!(MASK);
simd_shuffle8(
simd_shuffle8!(
a,
a,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11),
@ -19613,10 +19613,10 @@ pub unsafe fn _mm512_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m512d)
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_permutex_pd<const MASK: i32>(a: __m256d) -> __m256d {
static_assert_imm8!(MASK);
simd_shuffle4(
simd_shuffle4!(
a,
a,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11),
@ -20867,10 +20867,10 @@ pub unsafe fn _mm_mask2_permutex2var_pd(
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_shuffle_epi32<const MASK: _MM_PERM_ENUM>(a: __m512i) -> __m512i {
static_assert_imm8!(MASK);
let r: i32x16 = simd_shuffle16(
let r: i32x16 = simd_shuffle16!(
a.as_i32x16(),
a.as_i32x16(),
[
<const MASK: _MM_PERM_ENUM> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
(MASK as u32 >> 4) & 0b11,
@ -21003,10 +21003,10 @@ pub unsafe fn _mm_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_shuffle_ps<const MASK: i32>(a: __m512, b: __m512) -> __m512 {
static_assert_imm8!(MASK);
simd_shuffle16(
simd_shuffle16!(
a,
b,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11) + 16,
@ -21140,10 +21140,10 @@ pub unsafe fn _mm_maskz_shuffle_ps<const MASK: i32>(k: __mmask8, a: __m128, b: _
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_shuffle_pd<const MASK: i32>(a: __m512d, b: __m512d) -> __m512d {
static_assert_imm8!(MASK);
simd_shuffle8(
simd_shuffle8!(
a,
b,
[
<const MASK: i32> [
MASK as u32 & 0b1,
((MASK as u32 >> 1) & 0b1) + 8,
((MASK as u32 >> 2) & 0b1) + 2,
@ -21275,10 +21275,10 @@ pub unsafe fn _mm512_shuffle_i32x4<const MASK: i32>(a: __m512i, b: __m512i) -> _
static_assert_imm8!(MASK);
let a = a.as_i32x16();
let b = b.as_i32x16();
let r: i32x16 = simd_shuffle16(
let r: i32x16 = simd_shuffle16!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b11) * 4 + 0,
(MASK as u32 & 0b11) * 4 + 1,
(MASK as u32 & 0b11) * 4 + 2,
@ -21347,10 +21347,10 @@ pub unsafe fn _mm256_shuffle_i32x4<const MASK: i32>(a: __m256i, b: __m256i) -> _
static_assert_imm8!(MASK);
let a = a.as_i32x8();
let b = b.as_i32x8();
let r: i32x8 = simd_shuffle8(
let r: i32x8 = simd_shuffle8!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b1) * 4 + 0,
(MASK as u32 & 0b1) * 4 + 1,
(MASK as u32 & 0b1) * 4 + 2,
@ -21411,10 +21411,10 @@ pub unsafe fn _mm512_shuffle_i64x2<const MASK: i32>(a: __m512i, b: __m512i) -> _
static_assert_imm8!(MASK);
let a = a.as_i64x8();
let b = b.as_i64x8();
let r: i64x8 = simd_shuffle8(
let r: i64x8 = simd_shuffle8!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b11) * 2 + 0,
(MASK as u32 & 0b11) * 2 + 1,
((MASK as u32 >> 2) & 0b11) * 2 + 0,
@ -21475,10 +21475,10 @@ pub unsafe fn _mm256_shuffle_i64x2<const MASK: i32>(a: __m256i, b: __m256i) -> _
static_assert_imm8!(MASK);
let a = a.as_i64x4();
let b = b.as_i64x4();
let r: i64x4 = simd_shuffle4(
let r: i64x4 = simd_shuffle4!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b1) * 2 + 0,
(MASK as u32 & 0b1) * 2 + 1,
((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4,
@ -21535,10 +21535,10 @@ pub unsafe fn _mm512_shuffle_f32x4<const MASK: i32>(a: __m512, b: __m512) -> __m
static_assert_imm8!(MASK);
let a = a.as_f32x16();
let b = b.as_f32x16();
let r: f32x16 = simd_shuffle16(
let r: f32x16 = simd_shuffle16!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b11) * 4 + 0,
(MASK as u32 & 0b11) * 4 + 1,
(MASK as u32 & 0b11) * 4 + 2,
@ -21607,10 +21607,10 @@ pub unsafe fn _mm256_shuffle_f32x4<const MASK: i32>(a: __m256, b: __m256) -> __m
static_assert_imm8!(MASK);
let a = a.as_f32x8();
let b = b.as_f32x8();
let r: f32x8 = simd_shuffle8(
let r: f32x8 = simd_shuffle8!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b1) * 4 + 0,
(MASK as u32 & 0b1) * 4 + 1,
(MASK as u32 & 0b1) * 4 + 2,
@ -21671,10 +21671,10 @@ pub unsafe fn _mm512_shuffle_f64x2<const MASK: i32>(a: __m512d, b: __m512d) -> _
static_assert_imm8!(MASK);
let a = a.as_f64x8();
let b = b.as_f64x8();
let r: f64x8 = simd_shuffle8(
let r: f64x8 = simd_shuffle8!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b11) * 2 + 0,
(MASK as u32 & 0b11) * 2 + 1,
((MASK as u32 >> 2) & 0b11) * 2 + 0,
@ -21735,10 +21735,10 @@ pub unsafe fn _mm256_shuffle_f64x2<const MASK: i32>(a: __m256d, b: __m256d) -> _
static_assert_imm8!(MASK);
let a = a.as_f64x4();
let b = b.as_f64x4();
let r: f64x4 = simd_shuffle4(
let r: f64x4 = simd_shuffle4!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b1) * 2 + 0,
(MASK as u32 & 0b1) * 2 + 1,
((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4,
@ -21797,10 +21797,10 @@ pub unsafe fn _mm256_maskz_shuffle_f64x2<const MASK: i32>(
pub unsafe fn _mm512_extractf32x4_ps<const IMM8: i32>(a: __m512) -> __m128 {
static_assert_imm2!(IMM8);
match IMM8 & 0x3 {
0 => simd_shuffle4(a, _mm512_undefined_ps(), [0, 1, 2, 3]),
1 => simd_shuffle4(a, _mm512_undefined_ps(), [4, 5, 6, 7]),
2 => simd_shuffle4(a, _mm512_undefined_ps(), [8, 9, 10, 11]),
_ => simd_shuffle4(a, _mm512_undefined_ps(), [12, 13, 14, 15]),
0 => simd_shuffle4!(a, _mm512_undefined_ps(), [0, 1, 2, 3]),
1 => simd_shuffle4!(a, _mm512_undefined_ps(), [4, 5, 6, 7]),
2 => simd_shuffle4!(a, _mm512_undefined_ps(), [8, 9, 10, 11]),
_ => simd_shuffle4!(a, _mm512_undefined_ps(), [12, 13, 14, 15]),
}
}
@ -21854,8 +21854,8 @@ pub unsafe fn _mm512_maskz_extractf32x4_ps<const IMM8: i32>(k: __mmask8, a: __m5
pub unsafe fn _mm256_extractf32x4_ps<const IMM8: i32>(a: __m256) -> __m128 {
static_assert_imm1!(IMM8);
match IMM8 & 0x1 {
0 => simd_shuffle4(a, _mm256_undefined_ps(), [0, 1, 2, 3]),
_ => simd_shuffle4(a, _mm256_undefined_ps(), [4, 5, 6, 7]),
0 => simd_shuffle4!(a, _mm256_undefined_ps(), [0, 1, 2, 3]),
_ => simd_shuffle4!(a, _mm256_undefined_ps(), [4, 5, 6, 7]),
}
}
@ -21909,8 +21909,8 @@ pub unsafe fn _mm256_maskz_extractf32x4_ps<const IMM8: i32>(k: __mmask8, a: __m2
pub unsafe fn _mm512_extracti64x4_epi64<const IMM1: i32>(a: __m512i) -> __m256i {
static_assert_imm1!(IMM1);
match IMM1 {
0 => simd_shuffle4(a, _mm512_set1_epi64(0), [0, 1, 2, 3]),
_ => simd_shuffle4(a, _mm512_set1_epi64(0), [4, 5, 6, 7]),
0 => simd_shuffle4!(a, _mm512_set1_epi64(0), [0, 1, 2, 3]),
_ => simd_shuffle4!(a, _mm512_set1_epi64(0), [4, 5, 6, 7]),
}
}
@ -21964,8 +21964,8 @@ pub unsafe fn _mm512_maskz_extracti64x4_epi64<const IMM1: i32>(k: __mmask8, a: _
pub unsafe fn _mm512_extractf64x4_pd<const IMM8: i32>(a: __m512d) -> __m256d {
static_assert_imm1!(IMM8);
match IMM8 & 0x1 {
0 => simd_shuffle4(a, _mm512_undefined_pd(), [0, 1, 2, 3]),
_ => simd_shuffle4(a, _mm512_undefined_pd(), [4, 5, 6, 7]),
0 => simd_shuffle4!(a, _mm512_undefined_pd(), [0, 1, 2, 3]),
_ => simd_shuffle4!(a, _mm512_undefined_pd(), [4, 5, 6, 7]),
}
}
@ -22021,10 +22021,10 @@ pub unsafe fn _mm512_extracti32x4_epi32<const IMM2: i32>(a: __m512i) -> __m128i
let a = a.as_i32x16();
let undefined = _mm512_undefined_epi32().as_i32x16();
let extract: i32x4 = match IMM2 {
0 => simd_shuffle4(a, undefined, [0, 1, 2, 3]),
1 => simd_shuffle4(a, undefined, [4, 5, 6, 7]),
2 => simd_shuffle4(a, undefined, [8, 9, 10, 11]),
_ => simd_shuffle4(a, undefined, [12, 13, 14, 15]),
0 => simd_shuffle4!(a, undefined, [0, 1, 2, 3]),
1 => simd_shuffle4!(a, undefined, [4, 5, 6, 7]),
2 => simd_shuffle4!(a, undefined, [8, 9, 10, 11]),
_ => simd_shuffle4!(a, undefined, [12, 13, 14, 15]),
};
transmute(extract)
}
@ -22081,8 +22081,8 @@ pub unsafe fn _mm256_extracti32x4_epi32<const IMM1: i32>(a: __m256i) -> __m128i
let a = a.as_i32x8();
let undefined = _mm256_undefined_si256().as_i32x8();
let extract: i32x4 = match IMM1 {
0 => simd_shuffle4(a, undefined, [0, 1, 2, 3]),
_ => simd_shuffle4(a, undefined, [4, 5, 6, 7]),
0 => simd_shuffle4!(a, undefined, [0, 1, 2, 3]),
_ => simd_shuffle4!(a, undefined, [4, 5, 6, 7]),
};
transmute(extract)
}
@ -22131,7 +22131,7 @@ pub unsafe fn _mm256_maskz_extracti32x4_epi32<const IMM1: i32>(k: __mmask8, a: _
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovsldup))]
pub unsafe fn _mm512_moveldup_ps(a: __m512) -> __m512 {
let r: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
let r: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
transmute(r)
}
@ -22142,7 +22142,7 @@ pub unsafe fn _mm512_moveldup_ps(a: __m512) -> __m512 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovsldup))]
pub unsafe fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
let mov: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
let mov: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
transmute(simd_select_bitmask(k, mov, src.as_f32x16()))
}
@ -22153,7 +22153,7 @@ pub unsafe fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> _
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovsldup))]
pub unsafe fn _mm512_maskz_moveldup_ps(k: __mmask16, a: __m512) -> __m512 {
let mov: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
let mov: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
let zero = _mm512_setzero_ps().as_f32x16();
transmute(simd_select_bitmask(k, mov, zero))
}
@ -22211,7 +22211,7 @@ pub unsafe fn _mm_maskz_moveldup_ps(k: __mmask8, a: __m128) -> __m128 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovshdup))]
pub unsafe fn _mm512_movehdup_ps(a: __m512) -> __m512 {
let r: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
let r: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
transmute(r)
}
@ -22222,7 +22222,7 @@ pub unsafe fn _mm512_movehdup_ps(a: __m512) -> __m512 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovshdup))]
pub unsafe fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
let mov: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
let mov: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
transmute(simd_select_bitmask(k, mov, src.as_f32x16()))
}
@ -22233,7 +22233,7 @@ pub unsafe fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> _
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovshdup))]
pub unsafe fn _mm512_maskz_movehdup_ps(k: __mmask16, a: __m512) -> __m512 {
let mov: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
let mov: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
let zero = _mm512_setzero_ps().as_f32x16();
transmute(simd_select_bitmask(k, mov, zero))
}
@ -22291,7 +22291,7 @@ pub unsafe fn _mm_maskz_movehdup_ps(k: __mmask8, a: __m128) -> __m128 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovddup))]
pub unsafe fn _mm512_movedup_pd(a: __m512d) -> __m512d {
let r: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
let r: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
transmute(r)
}
@ -22302,7 +22302,7 @@ pub unsafe fn _mm512_movedup_pd(a: __m512d) -> __m512d {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovddup))]
pub unsafe fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
let mov: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
let mov: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
transmute(simd_select_bitmask(k, mov, src.as_f64x8()))
}
@ -22313,7 +22313,7 @@ pub unsafe fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> _
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovddup))]
pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d {
let mov: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
let mov: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
let zero = _mm512_setzero_pd().as_f64x8();
transmute(simd_select_bitmask(k, mov, zero))
}
@ -22376,22 +22376,22 @@ pub unsafe fn _mm512_inserti32x4<const IMM8: i32>(a: __m512i, b: __m128i) -> __m
let a = a.as_i32x16();
let b = _mm512_castsi128_si512(b).as_i32x16();
let ret: i32x16 = match IMM8 & 0b11 {
0 => simd_shuffle16(
0 => simd_shuffle16!(
a,
b,
[16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
),
1 => simd_shuffle16(
1 => simd_shuffle16!(
a,
b,
[0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
),
2 => simd_shuffle16(
2 => simd_shuffle16!(
a,
b,
[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
),
_ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
_ => simd_shuffle16!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
};
transmute(ret)
}
@ -22447,8 +22447,8 @@ pub unsafe fn _mm256_inserti32x4<const IMM8: i32>(a: __m256i, b: __m128i) -> __m
let a = a.as_i32x8();
let b = _mm256_castsi128_si256(b).as_i32x8();
let ret: i32x8 = match IMM8 & 0b1 {
0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
};
transmute(ret)
}
@ -22506,8 +22506,8 @@ pub unsafe fn _mm512_inserti64x4<const IMM8: i32>(a: __m512i, b: __m256i) -> __m
static_assert_imm1!(IMM8);
let b = _mm512_castsi256_si512(b);
match IMM8 & 0b1 {
0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
}
}
@ -22558,22 +22558,22 @@ pub unsafe fn _mm512_insertf32x4<const IMM8: i32>(a: __m512, b: __m128) -> __m51
static_assert_imm2!(IMM8);
let b = _mm512_castps128_ps512(b);
match IMM8 & 0b11 {
0 => simd_shuffle16(
0 => simd_shuffle16!(
a,
b,
[16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
),
1 => simd_shuffle16(
1 => simd_shuffle16!(
a,
b,
[0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
),
2 => simd_shuffle16(
2 => simd_shuffle16!(
a,
b,
[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
),
_ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
_ => simd_shuffle16!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
}
}
@ -22627,8 +22627,8 @@ pub unsafe fn _mm256_insertf32x4<const IMM8: i32>(a: __m256, b: __m128) -> __m25
static_assert_imm1!(IMM8);
let b = _mm256_castps128_ps256(b);
match IMM8 & 0b1 {
0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
}
}
@ -22685,8 +22685,8 @@ pub unsafe fn _mm512_insertf64x4<const IMM8: i32>(a: __m512d, b: __m256d) -> __m
static_assert_imm1!(IMM8);
let b = _mm512_castpd256_pd512(b);
match IMM8 & 0b1 {
0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
}
}
@ -22736,7 +22736,7 @@ pub unsafe fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i {
let a = a.as_i32x16();
let b = b.as_i32x16();
#[rustfmt::skip]
let r: i32x16 = simd_shuffle16(
let r: i32x16 = simd_shuffle16!(
a, b,
[ 2, 18, 3, 19,
2 + 4, 18 + 4, 3 + 4, 19 + 4,
@ -22837,7 +22837,7 @@ pub unsafe fn _mm_maskz_unpackhi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> _
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpckhpd))] //should be vpunpckhqdq
pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i {
simd_shuffle8(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
simd_shuffle8!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
}
/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -22932,7 +22932,7 @@ pub unsafe fn _mm_maskz_unpackhi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> _
#[cfg_attr(test, assert_instr(vunpckhps))]
pub unsafe fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 {
#[rustfmt::skip]
simd_shuffle16(
simd_shuffle16!(
a, b,
[ 2, 18, 3, 19,
2 + 4, 18 + 4, 3 + 4, 19 + 4,
@ -23017,7 +23017,7 @@ pub unsafe fn _mm_maskz_unpackhi_ps(k: __mmask8, a: __m128, b: __m128) -> __m128
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d {
simd_shuffle8(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
simd_shuffle8!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
}
/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -23109,7 +23109,7 @@ pub unsafe fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i {
let a = a.as_i32x16();
let b = b.as_i32x16();
#[rustfmt::skip]
let r: i32x16 = simd_shuffle16(
let r: i32x16 = simd_shuffle16!(
a, b,
[ 0, 16, 1, 17,
0 + 4, 16 + 4, 1 + 4, 17 + 4,
@ -23210,7 +23210,7 @@ pub unsafe fn _mm_maskz_unpacklo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> _
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpcklpd))] //should be vpunpcklqdq
pub unsafe fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i {
simd_shuffle8(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
simd_shuffle8!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
}
/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -23305,7 +23305,7 @@ pub unsafe fn _mm_maskz_unpacklo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> _
#[cfg_attr(test, assert_instr(vunpcklps))]
pub unsafe fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 {
#[rustfmt::skip]
simd_shuffle16(a, b,
simd_shuffle16!(a, b,
[ 0, 16, 1, 17,
0 + 4, 16 + 4, 1 + 4, 17 + 4,
0 + 8, 16 + 8, 1 + 8, 17 + 8,
@ -23389,7 +23389,7 @@ pub unsafe fn _mm_maskz_unpacklo_ps(k: __mmask8, a: __m128, b: __m128) -> __m128
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
pub unsafe fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d {
simd_shuffle8(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
simd_shuffle8!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
}
/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -23477,7 +23477,7 @@ pub unsafe fn _mm_maskz_unpacklo_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m1
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 {
simd_shuffle16(
simd_shuffle16!(
a,
_mm_set1_ps(-1.),
[0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
@ -23490,7 +23490,7 @@ pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 {
simd_shuffle16(
simd_shuffle16!(
a,
_mm256_set1_ps(-1.),
[0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
@ -23503,7 +23503,7 @@ pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextps128_ps512(a: __m128) -> __m512 {
simd_shuffle16(
simd_shuffle16!(
a,
_mm_set1_ps(0.),
[0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
@ -23516,7 +23516,7 @@ pub unsafe fn _mm512_zextps128_ps512(a: __m128) -> __m512 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextps256_ps512(a: __m256) -> __m512 {
simd_shuffle16(
simd_shuffle16!(
a,
_mm256_set1_ps(0.),
[0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
@ -23529,7 +23529,7 @@ pub unsafe fn _mm512_zextps256_ps512(a: __m256) -> __m512 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castps512_ps128(a: __m512) -> __m128 {
simd_shuffle4(a, a, [0, 1, 2, 3])
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
/// Cast vector of type __m512 to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23538,7 +23538,7 @@ pub unsafe fn _mm512_castps512_ps128(a: __m512) -> __m128 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castps512_ps256(a: __m512) -> __m256 {
simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Cast vector of type __m512 to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23565,7 +23565,7 @@ pub unsafe fn _mm512_castps_si512(a: __m512) -> __m512i {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castpd128_pd512(a: __m128d) -> __m512d {
simd_shuffle8(a, _mm_set1_pd(-1.), [0, 1, 2, 2, 2, 2, 2, 2])
simd_shuffle8!(a, _mm_set1_pd(-1.), [0, 1, 2, 2, 2, 2, 2, 2])
}
/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23574,7 +23574,7 @@ pub unsafe fn _mm512_castpd128_pd512(a: __m128d) -> __m512d {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castpd256_pd512(a: __m256d) -> __m512d {
simd_shuffle8(a, _mm256_set1_pd(-1.), [0, 1, 2, 3, 4, 4, 4, 4])
simd_shuffle8!(a, _mm256_set1_pd(-1.), [0, 1, 2, 3, 4, 4, 4, 4])
}
/// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23583,7 +23583,7 @@ pub unsafe fn _mm512_castpd256_pd512(a: __m256d) -> __m512d {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d {
simd_shuffle8(a, _mm_set1_pd(0.), [0, 1, 2, 2, 2, 2, 2, 2])
simd_shuffle8!(a, _mm_set1_pd(0.), [0, 1, 2, 2, 2, 2, 2, 2])
}
/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23592,7 +23592,7 @@ pub unsafe fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d {
simd_shuffle8(a, _mm256_set1_pd(0.), [0, 1, 2, 3, 4, 4, 4, 4])
simd_shuffle8!(a, _mm256_set1_pd(0.), [0, 1, 2, 3, 4, 4, 4, 4])
}
/// Cast vector of type __m512d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23601,7 +23601,7 @@ pub unsafe fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castpd512_pd128(a: __m512d) -> __m128d {
simd_shuffle2(a, a, [0, 1])
simd_shuffle2!(a, a, [0, 1])
}
/// Cast vector of type __m512d to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23610,7 +23610,7 @@ pub unsafe fn _mm512_castpd512_pd128(a: __m512d) -> __m128d {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castpd512_pd256(a: __m512d) -> __m256d {
simd_shuffle4(a, a, [0, 1, 2, 3])
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
/// Cast vector of type __m512d to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23637,7 +23637,7 @@ pub unsafe fn _mm512_castpd_si512(a: __m512d) -> __m512i {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i {
simd_shuffle8(a, _mm_set1_epi64x(-1), [0, 1, 2, 2, 2, 2, 2, 2])
simd_shuffle8!(a, _mm_set1_epi64x(-1), [0, 1, 2, 2, 2, 2, 2, 2])
}
/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23646,7 +23646,7 @@ pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castsi256_si512(a: __m256i) -> __m512i {
simd_shuffle8(a, _mm256_set1_epi64x(-1), [0, 1, 2, 3, 4, 4, 4, 4])
simd_shuffle8!(a, _mm256_set1_epi64x(-1), [0, 1, 2, 3, 4, 4, 4, 4])
}
/// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23655,7 +23655,7 @@ pub unsafe fn _mm512_castsi256_si512(a: __m256i) -> __m512i {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextsi128_si512(a: __m128i) -> __m512i {
simd_shuffle8(a, _mm_set1_epi64x(0), [0, 1, 2, 2, 2, 2, 2, 2])
simd_shuffle8!(a, _mm_set1_epi64x(0), [0, 1, 2, 2, 2, 2, 2, 2])
}
/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23664,7 +23664,7 @@ pub unsafe fn _mm512_zextsi128_si512(a: __m128i) -> __m512i {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextsi256_si512(a: __m256i) -> __m512i {
simd_shuffle8(a, _mm256_set1_epi64x(0), [0, 1, 2, 3, 4, 4, 4, 4])
simd_shuffle8!(a, _mm256_set1_epi64x(0), [0, 1, 2, 3, 4, 4, 4, 4])
}
/// Cast vector of type __m512i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23673,7 +23673,7 @@ pub unsafe fn _mm512_zextsi256_si512(a: __m256i) -> __m512i {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castsi512_si128(a: __m512i) -> __m128i {
simd_shuffle2(a, a, [0, 1])
simd_shuffle2!(a, a, [0, 1])
}
/// Cast vector of type __m512i to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23682,7 +23682,7 @@ pub unsafe fn _mm512_castsi512_si128(a: __m512i) -> __m128i {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castsi512_si256(a: __m512i) -> __m256i {
simd_shuffle4(a, a, [0, 1, 2, 3])
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
/// Cast vector of type __m512i to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23722,7 +23722,7 @@ pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 {
#[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastd
pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i {
let a = _mm512_castsi128_si512(a).as_i32x16();
let ret: i32x16 = simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
let ret: i32x16 = simd_shuffle16!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
transmute(ret)
}
@ -23802,7 +23802,7 @@ pub unsafe fn _mm_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m128i {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vbroadcas))] //should be vpbroadcastq
pub unsafe fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i {
simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -23881,7 +23881,7 @@ pub unsafe fn _mm_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m128i {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
pub unsafe fn _mm512_broadcastss_ps(a: __m128) -> __m512 {
simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle16!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
}
/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -23960,7 +23960,7 @@ pub unsafe fn _mm_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m128 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d {
simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -24016,7 +24016,7 @@ pub unsafe fn _mm256_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m256d {
#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf
pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i {
let a = a.as_i32x4();
let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]);
let ret: i32x16 = simd_shuffle16!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]);
transmute(ret)
}
@ -24048,7 +24048,7 @@ pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i
#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf
pub unsafe fn _mm256_broadcast_i32x4(a: __m128i) -> __m256i {
let a = a.as_i32x4();
let ret: i32x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]);
let ret: i32x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]);
transmute(ret)
}
@ -24079,7 +24079,7 @@ pub unsafe fn _mm256_maskz_broadcast_i32x4(k: __mmask8, a: __m128i) -> __m256i {
#[inline]
#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm
pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i {
simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
}
/// Broadcast the 4 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -24109,7 +24109,7 @@ pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i {
#[inline]
#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf
pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 {
simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3])
simd_shuffle16!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3])
}
/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -24139,7 +24139,7 @@ pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 {
#[inline]
#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshuf
pub unsafe fn _mm256_broadcast_f32x4(a: __m128) -> __m256 {
simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
}
/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -24169,7 +24169,7 @@ pub unsafe fn _mm256_maskz_broadcast_f32x4(k: __mmask8, a: __m128) -> __m256 {
#[inline]
#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm
pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d {
simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
}
/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -24326,66 +24326,62 @@ pub unsafe fn _mm512_alignr_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __
let b = b.as_i32x16();
let imm8: i32 = IMM8 % 16;
let r: i32x16 = match imm8 {
0 => simd_shuffle16(
0 => simd_shuffle16!(
a,
b,
[
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
],
[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,],
),
1 => simd_shuffle16(
1 => simd_shuffle16!(
a,
b,
[
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,
],
[17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,],
),
2 => simd_shuffle16(
2 => simd_shuffle16!(
a,
b,
[18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1],
),
3 => simd_shuffle16(
3 => simd_shuffle16!(
a,
b,
[19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2],
),
4 => simd_shuffle16(
4 => simd_shuffle16!(
a,
b,
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3],
),
5 => simd_shuffle16(
5 => simd_shuffle16!(
a,
b,
[21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4],
),
6 => simd_shuffle16(
6 => simd_shuffle16!(
a,
b,
[22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5],
),
7 => simd_shuffle16(
7 => simd_shuffle16!(
a,
b,
[23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6],
),
8 => simd_shuffle16(
8 => simd_shuffle16!(
a,
b,
[24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7],
),
9 => simd_shuffle16(
9 => simd_shuffle16!(
a,
b,
[25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8],
),
10 => simd_shuffle16(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
11 => simd_shuffle16(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
12 => simd_shuffle16(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
13 => simd_shuffle16(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
14 => simd_shuffle16(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
_ => simd_shuffle16(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
10 => simd_shuffle16!(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
11 => simd_shuffle16!(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
12 => simd_shuffle16!(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
13 => simd_shuffle16!(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
14 => simd_shuffle16!(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
_ => simd_shuffle16!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
};
transmute(r)
}
@ -24439,22 +24435,22 @@ pub unsafe fn _mm256_alignr_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __
let b = b.as_i32x8();
let imm8: i32 = IMM8 % 16;
let r: i32x8 = match imm8 {
0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
7 => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
8 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
9 => simd_shuffle8(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
10 => simd_shuffle8(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
11 => simd_shuffle8(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
12 => simd_shuffle8(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
13 => simd_shuffle8(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
14 => simd_shuffle8(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
_ => simd_shuffle8(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
1 => simd_shuffle8!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
2 => simd_shuffle8!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
3 => simd_shuffle8!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
4 => simd_shuffle8!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
5 => simd_shuffle8!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
6 => simd_shuffle8!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
7 => simd_shuffle8!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
8 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
9 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
10 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
11 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
12 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
13 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
14 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
_ => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
};
transmute(r)
}
@ -24508,14 +24504,14 @@ pub unsafe fn _mm_alignr_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m12
let b = b.as_i32x4();
let imm8: i32 = IMM8 % 8;
let r: i32x4 = match imm8 {
0 => simd_shuffle4(a, b, [4, 5, 6, 7]),
1 => simd_shuffle4(a, b, [5, 6, 7, 0]),
2 => simd_shuffle4(a, b, [6, 7, 0, 1]),
3 => simd_shuffle4(a, b, [7, 0, 1, 2]),
4 => simd_shuffle4(a, b, [0, 1, 2, 3]),
5 => simd_shuffle4(a, b, [1, 2, 3, 0]),
6 => simd_shuffle4(a, b, [2, 3, 0, 1]),
_ => simd_shuffle4(a, b, [3, 0, 1, 2]),
0 => simd_shuffle4!(a, b, [4, 5, 6, 7]),
1 => simd_shuffle4!(a, b, [5, 6, 7, 0]),
2 => simd_shuffle4!(a, b, [6, 7, 0, 1]),
3 => simd_shuffle4!(a, b, [7, 0, 1, 2]),
4 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
5 => simd_shuffle4!(a, b, [1, 2, 3, 0]),
6 => simd_shuffle4!(a, b, [2, 3, 0, 1]),
_ => simd_shuffle4!(a, b, [3, 0, 1, 2]),
};
transmute(r)
}
@ -24567,14 +24563,14 @@ pub unsafe fn _mm512_alignr_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __
static_assert_imm8!(IMM8);
let imm8: i32 = IMM8 % 8;
let r: i64x8 = match imm8 {
0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
_ => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
1 => simd_shuffle8!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
2 => simd_shuffle8!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
3 => simd_shuffle8!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
4 => simd_shuffle8!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
5 => simd_shuffle8!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
6 => simd_shuffle8!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
_ => simd_shuffle8!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
};
transmute(r)
}
@ -24626,14 +24622,14 @@ pub unsafe fn _mm256_alignr_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __
static_assert_imm8!(IMM8);
let imm8: i32 = IMM8 % 8;
let r: i64x4 = match imm8 {
0 => simd_shuffle4(a, b, [4, 5, 6, 7]),
1 => simd_shuffle4(a, b, [5, 6, 7, 0]),
2 => simd_shuffle4(a, b, [6, 7, 0, 1]),
3 => simd_shuffle4(a, b, [7, 0, 1, 2]),
4 => simd_shuffle4(a, b, [0, 1, 2, 3]),
5 => simd_shuffle4(a, b, [1, 2, 3, 4]),
6 => simd_shuffle4(a, b, [2, 3, 4, 5]),
_ => simd_shuffle4(a, b, [3, 4, 5, 6]),
0 => simd_shuffle4!(a, b, [4, 5, 6, 7]),
1 => simd_shuffle4!(a, b, [5, 6, 7, 0]),
2 => simd_shuffle4!(a, b, [6, 7, 0, 1]),
3 => simd_shuffle4!(a, b, [7, 0, 1, 2]),
4 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
5 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
6 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
_ => simd_shuffle4!(a, b, [3, 4, 5, 6]),
};
transmute(r)
}
@ -24685,10 +24681,10 @@ pub unsafe fn _mm_alignr_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m12
static_assert_imm8!(IMM8);
let imm8: i32 = IMM8 % 4;
let r: i64x2 = match imm8 {
0 => simd_shuffle2(a, b, [2, 3]),
1 => simd_shuffle2(a, b, [3, 0]),
2 => simd_shuffle2(a, b, [0, 1]),
_ => simd_shuffle2(a, b, [1, 2]),
0 => simd_shuffle2!(a, b, [2, 3]),
1 => simd_shuffle2!(a, b, [3, 0]),
2 => simd_shuffle2!(a, b, [0, 1]),
_ => simd_shuffle2!(a, b, [1, 2]),
};
transmute(r)
}

View file

@ -350,7 +350,7 @@ pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, cmpss(b, a, 1), [4, 1, 2, 3])
simd_shuffle4!(a, cmpss(b, a, 1), [4, 1, 2, 3])
}
/// Compares the lowest `f32` of both inputs for greater than or equal. The
@ -364,7 +364,7 @@ pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, cmpss(b, a, 2), [4, 1, 2, 3])
simd_shuffle4!(a, cmpss(b, a, 2), [4, 1, 2, 3])
}
/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
@ -420,7 +420,7 @@ pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, cmpss(b, a, 5), [4, 1, 2, 3])
simd_shuffle4!(a, cmpss(b, a, 5), [4, 1, 2, 3])
}
/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
@ -434,7 +434,7 @@ pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, cmpss(b, a, 6), [4, 1, 2, 3])
simd_shuffle4!(a, cmpss(b, a, 6), [4, 1, 2, 3])
}
/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
@ -1011,10 +1011,10 @@ pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
static_assert_imm8!(MASK);
simd_shuffle4(
simd_shuffle4!(
a,
b,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11) + 4,
@ -1032,7 +1032,7 @@ pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, b, [2, 6, 3, 7])
simd_shuffle4!(a, b, [2, 6, 3, 7])
}
/// Unpacks and interleave single-precision (32-bit) floating-point elements
@ -1044,7 +1044,7 @@ pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, b, [0, 4, 1, 5])
simd_shuffle4!(a, b, [0, 4, 1, 5])
}
/// Combine higher half of `a` and `b`. The higher half of `b` occupies the
@ -1057,7 +1057,7 @@ pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
    // TODO: figure out why this is a different instruction on Windows.
simd_shuffle4(a, b, [6, 7, 2, 3])
simd_shuffle4!(a, b, [6, 7, 2, 3])
}
/// Combine lower half of `a` and `b`. The lower half of `b` occupies the
@ -1069,7 +1069,7 @@ pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, b, [0, 1, 4, 5])
simd_shuffle4!(a, b, [0, 1, 4, 5])
}
/// Returns a mask of the most significant bit of each element in `a`.
@ -1201,7 +1201,7 @@ pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
let a = _mm_load_ps(p);
simd_shuffle4(a, a, [3, 2, 1, 0])
simd_shuffle4!(a, a, [3, 2, 1, 0])
}
/// Loads unaligned 64-bits of integer data from memory into new vector.
@ -1253,7 +1253,7 @@ pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
let b: __m128 = simd_shuffle4(a, a, [0, 0, 0, 0]);
let b: __m128 = simd_shuffle4!(a, a, [0, 0, 0, 0]);
*(p as *mut __m128) = b;
}
@ -1329,7 +1329,7 @@ pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
let b: __m128 = simd_shuffle4(a, a, [3, 2, 1, 0]);
let b: __m128 = simd_shuffle4!(a, a, [3, 2, 1, 0]);
*(p as *mut __m128) = b;
}
@ -1347,7 +1347,7 @@ pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, b, [4, 1, 2, 3])
simd_shuffle4!(a, b, [4, 1, 2, 3])
}
/// Performs a serializing operation on all store-to-memory instructions that

View file

@ -432,10 +432,10 @@ unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
}
}
let zero = _mm_set1_epi8(0).as_i8x16();
transmute(simd_shuffle16::<i8x16, i8x16>(
transmute::<i8x16, _>(simd_shuffle16!(
zero,
a.as_i8x16(),
[
<const IMM8: i32> [
mask(IMM8, 0),
mask(IMM8, 1),
mask(IMM8, 2),
@ -635,10 +635,10 @@ unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
}
}
let zero = _mm_set1_epi8(0).as_i8x16();
let x: i8x16 = simd_shuffle16(
let x: i8x16 = simd_shuffle16!(
a.as_i8x16(),
zero,
[
<const IMM8: i32> [
mask(IMM8, 0),
mask(IMM8, 1),
mask(IMM8, 2),
@ -895,7 +895,7 @@ pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
let a = a.as_i32x4();
simd_cast::<i32x2, __m128d>(simd_shuffle2(a, a, [0, 1]))
simd_cast::<i32x2, __m128d>(simd_shuffle2!(a, a, [0, 1]))
}
/// Returns `a` with its lower element replaced by `b` after converting it to
@ -1303,7 +1303,7 @@ pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i {
let zero = _mm_setzero_si128();
let r: i64x2 = simd_shuffle2(a.as_i64x2(), zero.as_i64x2(), [0, 2]);
let r: i64x2 = simd_shuffle2!(a.as_i64x2(), zero.as_i64x2(), [0, 2]);
transmute(r)
}
@ -1391,10 +1391,10 @@ pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 {
pub unsafe fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
static_assert_imm8!(IMM8);
let a = a.as_i32x4();
let x: i32x4 = simd_shuffle4(
let x: i32x4 = simd_shuffle4!(
a,
a,
[
<const IMM8: i32> [
IMM8 as u32 & 0b11,
(IMM8 as u32 >> 2) & 0b11,
(IMM8 as u32 >> 4) & 0b11,
@ -1419,10 +1419,10 @@ pub unsafe fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
pub unsafe fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
static_assert_imm8!(IMM8);
let a = a.as_i16x8();
let x: i16x8 = simd_shuffle8(
let x: i16x8 = simd_shuffle8!(
a,
a,
[
<const IMM8: i32> [
0,
1,
2,
@ -1451,10 +1451,10 @@ pub unsafe fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
pub unsafe fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
static_assert_imm8!(IMM8);
let a = a.as_i16x8();
let x: i16x8 = simd_shuffle8(
let x: i16x8 = simd_shuffle8!(
a,
a,
[
<const IMM8: i32> [
IMM8 as u32 & 0b11,
(IMM8 as u32 >> 2) & 0b11,
(IMM8 as u32 >> 4) & 0b11,
@ -1476,7 +1476,7 @@ pub unsafe fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(punpckhbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
transmute::<i8x16, _>(simd_shuffle16(
transmute::<i8x16, _>(simd_shuffle16!(
a.as_i8x16(),
b.as_i8x16(),
[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
@ -1491,7 +1491,7 @@ pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(punpckhwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
let x = simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
let x = simd_shuffle8!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
transmute::<i16x8, _>(x)
}
@ -1503,7 +1503,7 @@ pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
transmute::<i32x4, _>(simd_shuffle4(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7]))
transmute::<i32x4, _>(simd_shuffle4!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7]))
}
/// Unpacks and interleave 64-bit integers from the high half of `a` and `b`.
@ -1514,7 +1514,7 @@ pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(unpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
transmute::<i64x2, _>(simd_shuffle2(a.as_i64x2(), b.as_i64x2(), [1, 3]))
transmute::<i64x2, _>(simd_shuffle2!(a.as_i64x2(), b.as_i64x2(), [1, 3]))
}
/// Unpacks and interleave 8-bit integers from the low half of `a` and `b`.
@ -1525,7 +1525,7 @@ pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(punpcklbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
transmute::<i8x16, _>(simd_shuffle16(
transmute::<i8x16, _>(simd_shuffle16!(
a.as_i8x16(),
b.as_i8x16(),
[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
@ -1540,7 +1540,7 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(punpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
let x = simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
let x = simd_shuffle8!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
transmute::<i16x8, _>(x)
}
@ -1552,7 +1552,7 @@ pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
transmute::<i32x4, _>(simd_shuffle4(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]))
transmute::<i32x4, _>(simd_shuffle4!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]))
}
/// Unpacks and interleave 64-bit integers from the low half of `a` and `b`.
@ -1563,7 +1563,7 @@ pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
transmute::<i64x2, _>(simd_shuffle2(a.as_i64x2(), b.as_i64x2(), [0, 2]))
transmute::<i64x2, _>(simd_shuffle2!(a.as_i64x2(), b.as_i64x2(), [0, 2]))
}
/// Returns a new vector with the low element of `a` replaced by the sum of the
@ -2519,7 +2519,7 @@ pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
let b: __m128d = simd_shuffle2(a, a, [0, 0]);
let b: __m128d = simd_shuffle2!(a, a, [0, 0]);
*(mem_addr as *mut __m128d) = b;
}
@ -2533,7 +2533,7 @@ pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
let b: __m128d = simd_shuffle2(a, a, [0, 0]);
let b: __m128d = simd_shuffle2!(a, a, [0, 0]);
*(mem_addr as *mut __m128d) = b;
}
@ -2548,7 +2548,7 @@ pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
let b: __m128d = simd_shuffle2(a, a, [1, 0]);
let b: __m128d = simd_shuffle2!(a, a, [1, 0]);
*(mem_addr as *mut __m128d) = b;
}
@ -2612,7 +2612,7 @@ pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
let a = _mm_load_pd(mem_addr);
simd_shuffle2(a, a, [1, 0])
simd_shuffle2!(a, a, [1, 0])
}
/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
@ -2653,7 +2653,7 @@ pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
static_assert_imm8!(MASK);
simd_shuffle2(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2])
simd_shuffle2!(a, b, <const MASK: i32> [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2])
}
/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
@ -2777,7 +2777,7 @@ pub unsafe fn _mm_undefined_si128() -> __m128i {
#[cfg_attr(test, assert_instr(unpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
simd_shuffle2(a, b, [1, 3])
simd_shuffle2!(a, b, [1, 3])
}
/// The resulting `__m128d` element is composed of the high-order values of
@ -2792,7 +2792,7 @@ pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
simd_shuffle2(a, b, [0, 2])
simd_shuffle2!(a, b, [0, 2])
}
#[allow(improper_ctypes)]

View file

@ -106,7 +106,7 @@ pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(movddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d {
simd_shuffle2(a, a, [0, 0])
simd_shuffle2!(a, a, [0, 0])
}
/// Loads a double-precision (64-bit) floating-point element from memory
@ -130,7 +130,7 @@ pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d {
#[cfg_attr(test, assert_instr(movshdup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 {
simd_shuffle4(a, a, [1, 1, 3, 3])
simd_shuffle4!(a, a, [1, 1, 3, 3])
}
/// Duplicate even-indexed single-precision (32-bit) floating-point elements
@ -142,7 +142,7 @@ pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(movsldup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 {
simd_shuffle4(a, a, [0, 0, 2, 2])
simd_shuffle4!(a, a, [0, 0, 2, 2])
}
#[allow(improper_ctypes)]

View file

@ -379,7 +379,7 @@ pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
let a = a.as_i8x16();
let a = simd_shuffle8::<_, i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
let a: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
transmute(simd_cast::<_, i16x8>(a))
}
@ -392,7 +392,7 @@ pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
let a = a.as_i8x16();
let a = simd_shuffle4::<_, i8x4>(a, a, [0, 1, 2, 3]);
let a: i8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
transmute(simd_cast::<_, i32x4>(a))
}
@ -406,7 +406,7 @@ pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
let a = a.as_i8x16();
let a = simd_shuffle2::<_, i8x2>(a, a, [0, 1]);
let a: i8x2 = simd_shuffle2!(a, a, [0, 1]);
transmute(simd_cast::<_, i64x2>(a))
}
@ -419,7 +419,7 @@ pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
let a = a.as_i16x8();
let a = simd_shuffle4::<_, i16x4>(a, a, [0, 1, 2, 3]);
let a: i16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
transmute(simd_cast::<_, i32x4>(a))
}
@ -432,7 +432,7 @@ pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
let a = a.as_i16x8();
let a = simd_shuffle2::<_, i16x2>(a, a, [0, 1]);
let a: i16x2 = simd_shuffle2!(a, a, [0, 1]);
transmute(simd_cast::<_, i64x2>(a))
}
@ -445,7 +445,7 @@ pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
let a = a.as_i32x4();
let a = simd_shuffle2::<_, i32x2>(a, a, [0, 1]);
let a: i32x2 = simd_shuffle2!(a, a, [0, 1]);
transmute(simd_cast::<_, i64x2>(a))
}
@ -458,7 +458,7 @@ pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
let a = a.as_u8x16();
let a = simd_shuffle8::<_, u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
let a: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
transmute(simd_cast::<_, i16x8>(a))
}
@ -471,7 +471,7 @@ pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
let a = a.as_u8x16();
let a = simd_shuffle4::<_, u8x4>(a, a, [0, 1, 2, 3]);
let a: u8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
transmute(simd_cast::<_, i32x4>(a))
}
@ -484,7 +484,7 @@ pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
let a = a.as_u8x16();
let a = simd_shuffle2::<_, u8x2>(a, a, [0, 1]);
let a: u8x2 = simd_shuffle2!(a, a, [0, 1]);
transmute(simd_cast::<_, i64x2>(a))
}
@ -498,7 +498,7 @@ pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
let a = a.as_u16x8();
let a = simd_shuffle4::<_, u16x4>(a, a, [0, 1, 2, 3]);
let a: u16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
transmute(simd_cast::<_, i32x4>(a))
}
@ -512,7 +512,7 @@ pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
let a = a.as_u16x8();
let a = simd_shuffle2::<_, u16x2>(a, a, [0, 1]);
let a: u16x2 = simd_shuffle2!(a, a, [0, 1]);
transmute(simd_cast::<_, i64x2>(a))
}
@ -526,7 +526,7 @@ pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
let a = a.as_u32x4();
let a = simd_shuffle2::<_, u32x2>(a, a, [0, 1]);
let a: u32x2 = simd_shuffle2!(a, a, [0, 1]);
transmute(simd_cast::<_, i64x2>(a))
}

View file

@ -113,10 +113,10 @@ pub unsafe fn _mm_alignr_epi8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128
shift + i
}
}
let r: i8x16 = simd_shuffle16(
let r: i8x16 = simd_shuffle16!(
b.as_i8x16(),
a.as_i8x16(),
[
<const IMM8: i32> [
mask(IMM8 as u32, 0),
mask(IMM8 as u32, 1),
mask(IMM8 as u32, 2),

View file

@ -194,8 +194,8 @@ generate int32x2_t:int32x2_t:int64x2_t
/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle8, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, {vabd_u8, c, d}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
@ -207,8 +207,8 @@ generate uint8x16_t:uint8x16_t:uint16x8_t
/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle4, c:uint16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4, d:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_cast, {vabd_u16, c, d}
a = 1, 2, 3, 4, 8, 9, 11, 12
b = 10, 10, 10, 10, 10, 10, 10, 10
@ -220,8 +220,8 @@ generate uint16x8_t:uint16x8_t:uint32x4_t
/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle2, c:uint32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2, d:uint32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3]
multi_fn = simd_cast, {vabd_u32, c, d}
a = 1, 2, 3, 4
b = 10, 10, 10, 10
@ -233,8 +233,8 @@ generate uint32x4_t:uint32x4_t:uint64x2_t
/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle8, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, e:uint8x8_t, {vabd_s8, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
@ -247,8 +247,8 @@ generate int8x16_t:int8x16_t:int16x8_t
/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle4, c:int16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4, d:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_cast, e:uint16x4_t, {vabd_s16, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4, 9, 10, 11, 12
@ -261,8 +261,8 @@ generate int16x8_t:int16x8_t:int32x4_t
/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle2, c:int32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2, d:int32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3]
multi_fn = simd_cast, e:uint32x2_t, {vabd_s32, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4
@ -727,7 +727,7 @@ lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
@ -744,7 +744,7 @@ lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1., 2., 3., 4.
b = 0., 0.5, 0., 0.
n = 0:1
@ -759,8 +759,8 @@ lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in_len-noext, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in_len-LANE2}
multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
@ -777,8 +777,8 @@ lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in_len-noext, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in_len-LANE2}
multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2}
a = 1., 2., 3., 4.
b = 0., 0.5, 0., 0.
n = 0:1
@ -793,8 +793,8 @@ lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-noext, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
@ -811,8 +811,8 @@ lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-noext, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1:0
@ -827,8 +827,8 @@ lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-noext, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1., 2., 3., 4.
b = 0.5, 0., 0., 0.
n = 1:0
@ -897,7 +897,7 @@ generate float32x2_t:float64x2_t
/// Floating-point convert to higher precision long
name = vcvt_high
noq-double-suffixes
multi_fn = simd_shuffle2, b:float32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2!, b:float32x2_t, a, a, [2, 3]
multi_fn = simd_cast, b
a = -1.2, 1.2, 2.3, 3.4
validate 2.3f32 as f64, 3.4f32 as f64
@ -918,7 +918,7 @@ generate float64x2_t:float32x2_t
/// Floating-point convert to lower precision narrow
name = vcvt_high
noq-double-suffixes
multi_fn = simd_shuffle4, a, {simd_cast, b}, [0, 1, 2, 3]
multi_fn = simd_shuffle4!, a, {simd_cast, b}, [0, 1, 2, 3]
a = -1.2, 1.2
b = -2.3, 3.4
validate -1.2, 1.2, -2.3f64 as f32, 3.4f64 as f32
@ -939,7 +939,7 @@ generate float64x2_t:float32x2_t
/// Floating-point convert to lower precision narrow, rounding to odd
name = vcvtx_high
noq-double-suffixes
multi_fn = simd_shuffle4, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3]
multi_fn = simd_shuffle4!, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3]
a = -1.0, 2.0
b = -3.0, 4.0
validate -1.0, 2.0, -3.0, 4.0
@ -1162,7 +1162,7 @@ name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-noext, a, a, {dup-out_len-N as u32}
multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
n = HFLEN
validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
@ -1188,7 +1188,7 @@ name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-noext, a, a, {dup-out_len-N as u32}
multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
n = HFLEN
validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
@ -1202,7 +1202,7 @@ name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-noext, a, a, {dup-out_len-N as u32}
multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
a = 1., 1., 1., 4.
n = HFLEN
validate 1., 1., 1., 1.
@ -1303,7 +1303,7 @@ generate float32x2_t:f32, float32x4_t:f32, float64x1_t:f64, float64x2_t:f64
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15
b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11
n = HFLEN
@ -1317,7 +1317,7 @@ generate int*_t, uint*_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15
b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11
n = HFLEN
@ -1333,7 +1333,7 @@ generate int64x2_t, uint64x2_t
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
a = 0., 2., 2., 3.
b = 3., 4., 5., 6.,
n = HFLEN
@ -1403,7 +1403,7 @@ name = vmla
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@ -1422,7 +1422,7 @@ name = vmla
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 0., 1., 2., 3.
b = 2., 2., 2., 2.
c = 0., 3., 0., 0.
@ -1477,7 +1477,7 @@ name = vmlal_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@ -1495,8 +1495,8 @@ generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint
/// Signed multiply-add long
name = vmlal_high
no-q
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
multi_fn = vmlal-noqself-noext, a, b, c
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@ -1510,8 +1510,8 @@ generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:
/// Unsigned multiply-add long
name = vmlal_high
no-q
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
multi_fn = vmlal-noqself-noext, a, b, c
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@ -1541,7 +1541,7 @@ name = vmlal_high_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@ -1613,7 +1613,7 @@ name = vmls
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@ -1632,7 +1632,7 @@ name = vmls
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 6., 7., 8., 9.
b = 2., 2., 2., 2.
c = 0., 3., 0., 0.
@ -1687,7 +1687,7 @@ name = vmlsl_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@ -1705,8 +1705,8 @@ generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint
/// Signed multiply-subtract long
name = vmlsl_high
no-q
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
multi_fn = vmlsl-noqself-noext, a, b, c
a = 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@ -1720,8 +1720,8 @@ generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:
/// Unsigned multiply-subtract long
name = vmlsl_high
no-q
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
multi_fn = vmlsl-noqself-noext, a, b, c
a = 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@ -1751,7 +1751,7 @@ name = vmlsl_high_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 14, 15, 16, 17, 18, 19, 20, 21
b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@ -1769,7 +1769,7 @@ generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint
name = vmovn_high
no-q
multi_fn = simd_cast, c:in_t0, b
multi_fn = simd_shuffle-out_len-noext, a, c, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, c, {asc-0-out_len}
a = 0, 1, 2, 3, 2, 3, 4, 5
b = 2, 3, 4, 5, 12, 13, 14, 15
validate 0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15
@ -2070,7 +2070,7 @@ name = vmul
lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_mul, a, {simd_shuffle-out_len-noext, b, b, {dup-out_len-LANE as u32}}
multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
@ -2102,7 +2102,7 @@ name = vmul
lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_mul, a, {simd_shuffle-out_len-noext, b, b, {dup-out_len-LANE as u32}}
multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}}
a = 1., 2., 3., 4.
b = 2., 0., 0., 0.
n = 0
@ -2155,8 +2155,8 @@ generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:i
/// Signed multiply long
name = vmull_high
no-q
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = vmull-noqself-noext, a, b
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
@ -2181,8 +2181,8 @@ generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint3
/// Unsigned multiply long
name = vmull_high
no-q
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = vmull-noqself-noext, a, b
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
@ -2222,8 +2222,8 @@ link-arm = vmullp.v2i64:int64x1_t:int64x1_t:int64x1_t:int64x2_t
/// Polynomial multiply long
name = vmull_high
no-q
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = vmull-noqself-noext, a, b
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3
@ -2263,7 +2263,7 @@ generate uint16x4_t:u16:uint32x4_t, uint32x2_t:u32:uint64x2_t
name = vmull_lane
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}}
multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
@ -2294,7 +2294,7 @@ generate uint16x8_t:u16:uint32x4_t, uint32x4_t:u32:uint64x2_t
name = vmull_high_lane
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}}
multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
@ -2336,7 +2336,7 @@ name = vmulx
lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}}
multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
a = 1., 2., 3., 4.
b = 2., 0., 0., 0.
n = 0
@ -2573,7 +2573,7 @@ generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
name = vsubhn_high
no-q
multi_fn = vsubhn-noqself-noext, d:in_t0, b, c
multi_fn = simd_shuffle-out_len-noext, a, d, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, d, {asc-0-out_len}
a = MAX, 0, MAX, 0, MAX, 0, MAX, 0
b = MAX, 1, MAX, 1, MAX, 1, MAX, 1
c = 1, 0, 1, 0, 1, 0, 1, 0
@ -2629,7 +2629,7 @@ generate uint16x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint32x4_t, uint
/// Signed Subtract Wide
name = vsubw_high
no-q
multi_fn = simd_shuffle8, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_sub, a, {simd_cast, c}
a = 8, 9, 10, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16
@ -2641,7 +2641,7 @@ generate int16x8_t:int8x16_t:int16x8_t
/// Signed Subtract Wide
name = vsubw_high
no-q
multi_fn = simd_shuffle4, c:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, c:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_sub, a, {simd_cast, c}
a = 8, 9, 10, 11
b = 0, 1, 2, 3, 8, 9, 10, 11
@ -2653,7 +2653,7 @@ generate int32x4_t:int16x8_t:int32x4_t
/// Signed Subtract Wide
name = vsubw_high
no-q
multi_fn = simd_shuffle2, c:int32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, c:int32x2_t, b, b, [2, 3]
multi_fn = simd_sub, a, {simd_cast, c}
a = 8, 9
b = 6, 7, 8, 9
@ -2665,7 +2665,7 @@ generate int64x2_t:int32x4_t:int64x2_t
/// Unsigned Subtract Wide
name = vsubw_high
no-q
multi_fn = simd_shuffle8, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_sub, a, {simd_cast, c}
a = 8, 9, 10, 11, 12, 13, 14, 15
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@ -2677,7 +2677,7 @@ generate uint16x8_t:uint8x16_t:uint16x8_t
/// Unsigned Subtract Wide
name = vsubw_high
no-q
multi_fn = simd_shuffle4, c:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, c:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_sub, a, {simd_cast, c}
a = 8, 9, 10, 11
b = 0, 1, 2, 3, 8, 9, 10, 11
@ -2689,7 +2689,7 @@ generate uint32x4_t:uint16x8_t:uint32x4_t
/// Unsigned Subtract Wide
name = vsubw_high
no-q
multi_fn = simd_shuffle2, c:uint32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, c:uint32x2_t, b, b, [2, 3]
multi_fn = simd_sub, a, {simd_cast, c}
a = 8, 9
b = 6, 7, 8, 9
@ -2731,9 +2731,9 @@ generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint3
/// Signed Subtract Long
name = vsubl_high
no-q
multi_fn = simd_shuffle8, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, d:out_t, c
multi_fn = simd_shuffle8, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, f:out_t, e
multi_fn = simd_sub, d, f
@ -2747,9 +2747,9 @@ generate int8x16_t:int8x16_t:int16x8_t
/// Signed Subtract Long
name = vsubl_high
no-q
multi_fn = simd_shuffle4, c:int16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_cast, d:out_t, c
multi_fn = simd_shuffle4, e:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, e:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_cast, f:out_t, e
multi_fn = simd_sub, d, f
@ -2763,9 +2763,9 @@ generate int16x8_t:int16x8_t:int32x4_t
/// Signed Subtract Long
name = vsubl_high
no-q
multi_fn = simd_shuffle2, c:int32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3]
multi_fn = simd_cast, d:out_t, c
multi_fn = simd_shuffle2, e:int32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, e:int32x2_t, b, b, [2, 3]
multi_fn = simd_cast, f:out_t, e
multi_fn = simd_sub, d, f
@ -2779,9 +2779,9 @@ generate int32x4_t:int32x4_t:int64x2_t
/// Unsigned Subtract Long
name = vsubl_high
no-q
multi_fn = simd_shuffle8, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, d:out_t, c
multi_fn = simd_shuffle8, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, f:out_t, e
multi_fn = simd_sub, d, f
@ -2795,9 +2795,9 @@ generate uint8x16_t:uint8x16_t:uint16x8_t
/// Unsigned Subtract Long
name = vsubl_high
no-q
multi_fn = simd_shuffle4, c:uint16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_cast, d:out_t, c
multi_fn = simd_shuffle4, e:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, e:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_cast, f:out_t, e
multi_fn = simd_sub, d, f
@ -2811,9 +2811,9 @@ generate uint16x8_t:uint16x8_t:uint32x4_t
/// Unsigned Subtract Long
name = vsubl_high
no-q
multi_fn = simd_shuffle2, c:uint32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3]
multi_fn = simd_cast, d:out_t, c
multi_fn = simd_shuffle2, e:uint32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, e:uint32x2_t, b, b, [2, 3]
multi_fn = simd_cast, f:out_t, e
multi_fn = simd_sub, d, f
@ -3011,8 +3011,8 @@ generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t
/// Signed saturating doubling multiply long
name = vqdmull_high
no-q
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {asc-halflen-halflen}
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {asc-halflen-halflen}
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-halflen-halflen}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {asc-halflen-halflen}
multi_fn = vqdmull-noqself-noext, a, b
a = 0, 1, 4, 5, 4, 5, 6, 7
b = 1, 2, 5, 6, 5, 6, 7, 8
@ -3024,7 +3024,7 @@ generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t
/// Signed saturating doubling multiply long
name = vqdmull_high_n
no-q
multi_fn = simd_shuffle-out_len-noext, a:in_ntt, a, a, {asc-out_len-out_len}
multi_fn = simd_shuffle-out_len-!, a:in_ntt, a, a, {asc-out_len-out_len}
multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
multi_fn = vqdmull-in_ntt-noext, a, b
a = 0, 2, 8, 10, 8, 10, 12, 14
@ -3038,7 +3038,7 @@ generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t
name = vqdmull_lane
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-noext, b:in_t0, b, b, {dup-out_len-N as u32}
multi_fn = simd_shuffle-out_len-!, b:in_t0, b, b, {dup-out_len-N as u32}
multi_fn = vqdmull-noqself-noext, a, b
a = 1, 2, 3, 4
b = 0, 2, 2, 0, 2, 0, 0, 0
@ -3083,8 +3083,8 @@ generate i32:int32x2_t:i64, i32:int32x4_t:i64
name = vqdmull_high_lane
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-noext, a:in_t, a, a, {asc-out_len-out_len}
multi_fn = simd_shuffle-out_len-noext, b:in_t, b, b, {dup-out_len-N as u32}
multi_fn = simd_shuffle-out_len-!, a:in_t, a, a, {asc-out_len-out_len}
multi_fn = simd_shuffle-out_len-!, b:in_t, b, b, {dup-out_len-N as u32}
multi_fn = vqdmull-self-noext, a, b
a = 0, 1, 4, 5, 4, 5, 6, 7
b = 0, 2, 2, 0, 2, 0, 0, 0
@ -3098,8 +3098,8 @@ generate int16x8_t:int16x4_t:int32x4_t, int32x4_t:int32x2_t:int64x2_t
name = vqdmull_high_lane
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {asc-out_len-out_len}
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {dup-out_len-N as u32}
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-out_len-out_len}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {dup-out_len-N as u32}
multi_fn = vqdmull-noqself-noext, a, b
a = 0, 1, 4, 5, 4, 5, 6, 7
b = 0, 2, 2, 0, 2, 0, 0, 0
@ -3390,7 +3390,7 @@ name = vqrdmulh
lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_shuffle-out_len-noext, b:out_t, b, b, {dup-out_len-LANE as u32}
multi_fn = simd_shuffle-out_len-!, b:out_t, b, b, {dup-out_len-LANE as u32}
multi_fn = vqrdmulh-out-noext, a, b
a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
b = 0, 2, 0, 0, 0, 0, 0, 0,
@ -3616,7 +3616,7 @@ name = vqrshrn_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 2, 3, 2, 3, 6, 7
b = 8, 12, 24, 28, 48, 52, 56, 60
n = 2
@ -3662,7 +3662,7 @@ name = vqrshrn_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 2, 3, 2, 3, 6, 7
b = 8, 12, 24, 28, 48, 52, 56, 60
n = 2
@ -3708,7 +3708,7 @@ name = vqrshrun_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vqrshrun_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vqrshrun_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 2, 3, 2, 3, 6, 7
b = 8, 12, 24, 28, 48, 52, 56, 60
n = 2
@ -3858,7 +3858,7 @@ name = vqshrn_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 8, 9, 8, 9, 10, 11
b = 32, 36, 40, 44, 48, 52, 56, 60
n = 2
@ -3903,7 +3903,7 @@ name = vqshrn_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 8, 9, 8, 9, 10, 11
b = 32, 36, 40, 44, 48, 52, 56, 60
n = 2
@ -3948,7 +3948,7 @@ name = vqshrun_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vqshrun_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vqshrun_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 8, 9, 8, 9, 10, 11
b = 32, 36, 40, 44, 48, 52, 56, 60
n = 2
@ -4312,7 +4312,7 @@ name = vrshrn_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 8, 9, 8, 9, 10, 11
b = 32, 36, 40, 44, 48, 52, 56, 60
n = 2
@ -4542,7 +4542,7 @@ name = vshll_high_n
no-q
constn = N
multi_fn = static_assert-N-0-bits
multi_fn = simd_shuffle-out_len-noext, b:half, a, a, {asc-halflen-halflen}
multi_fn = simd_shuffle-out_len-!, b:half, a, a, {asc-halflen-halflen}
multi_fn = vshll_n-noqself-::<N>, b
a = 0, 0, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8
n = 2
@ -4589,7 +4589,7 @@ name = vshrn_high_n
no-q
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vshrn_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 1, 2, 5, 6, 5, 6, 7, 8
b = 20, 24, 28, 32, 52, 56, 60, 64
n = 2
@ -4631,7 +4631,7 @@ generate uint*_t, uint64x*_t
/// Transpose vectors
name = vtrn1
multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-1-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len}
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
validate 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
@ -4644,7 +4644,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
/// Transpose vectors
name = vtrn1
multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-1-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len}
a = 0., 2., 4., 6., 8., 10., 12., 14.
b = 1., 3., 5., 7., 9., 11., 13., 15.
validate 0., 1., 4., 5., 8., 9., 12., 13.
@ -4657,7 +4657,7 @@ generate float32x2_t, float64x2_t
/// Transpose vectors
name = vtrn2
multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-2-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len}
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
validate 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
@ -4670,7 +4670,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
/// Transpose vectors
name = vtrn2
multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-2-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len}
a = 0., 2., 4., 6., 8., 10., 12., 14.
b = 1., 3., 5., 7., 9., 11., 13., 15.
validate 2., 3., 6., 7., 10., 11., 14., 15.
@ -4683,7 +4683,7 @@ generate float32x2_t, float64x2_t
/// Zip vectors
name = vzip1
multi_fn = simd_shuffle-in_len-noext, a, b, {zip-1-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len}
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@ -4693,7 +4693,7 @@ generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4
/// Zip vectors
name = vzip1
multi_fn = simd_shuffle-in_len-noext, a, b, {zip-1-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len}
a = 0., 2., 4., 6., 8., 10., 12., 14.
b = 1., 3., 5., 7., 9., 11., 13., 15.
validate 0., 1., 2., 3., 4., 5., 6., 7.
@ -4703,7 +4703,7 @@ generate float32x2_t, float32x4_t, float64x2_t
/// Zip vectors
name = vzip2
multi_fn = simd_shuffle-in_len-noext, a, b, {zip-2-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len}
a = 0, 16, 16, 18, 16, 18, 20, 22, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 17, 17, 19, 17, 19, 21, 23, 17, 19, 21, 23, 25, 27, 29, 31
validate 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@ -4713,7 +4713,7 @@ generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4
/// Zip vectors
name = vzip2
multi_fn = simd_shuffle-in_len-noext, a, b, {zip-2-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len}
a = 0., 8., 8., 10., 8., 10., 12., 14.
b = 1., 9., 9., 11., 9., 11., 13., 15.
validate 8., 9., 10., 11., 12., 13., 14., 15.
@ -4723,7 +4723,7 @@ generate float32x2_t, float32x4_t, float64x2_t
/// Unzip vectors
name = vuzp1
multi_fn = simd_shuffle-in_len-noext, a, b, {unzip-1-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len}
a = 1, 0, 2, 0, 2, 0, 3, 0, 2, 0, 3, 0, 7, 0, 8, 0
b = 2, 0, 3, 0, 7, 0, 8, 0, 13, 0, 14, 0, 15, 0, 16, 0
validate 1, 2, 2, 3, 2, 3, 7, 8, 2, 3, 7, 8, 13, 14, 15, 16
@ -4736,7 +4736,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
/// Unzip vectors
name = vuzp1
multi_fn = simd_shuffle-in_len-noext, a, b, {unzip-1-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len}
a = 0., 8., 1., 9., 4., 12., 5., 13.
b = 1., 10., 3., 11., 6., 14., 7., 15.
validate 0., 1., 1., 3., 4., 5., 6., 7.
@ -4749,7 +4749,7 @@ generate float32x2_t, float64x2_t
/// Unzip vectors
name = vuzp2
multi_fn = simd_shuffle-in_len-noext, a, b, {unzip-2-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len}
a = 0, 17, 0, 18, 0, 18, 0, 19, 0, 18, 0, 19, 0, 23, 0, 24
b = 0, 18, 0, 19, 0, 23, 0, 24, 0, 29, 0, 30, 0, 31, 0, 32
validate 17, 18, 18, 19, 18, 19, 23, 24, 18, 19, 23, 24, 29, 30, 31, 32
@ -4762,7 +4762,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
/// Unzip vectors
name = vuzp2
multi_fn = simd_shuffle-in_len-noext, a, b, {unzip-2-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len}
a = 0., 8., 1., 9., 4., 12., 5., 13.
b = 2., 9., 3., 11., 6., 14., 7., 15.
validate 8., 9., 9., 11., 12., 13., 14., 15.
@ -4793,8 +4793,8 @@ generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16
/// Unsigned Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle8, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = vabd_u8, d, e, f:uint8x8_t
multi_fn = simd_add, a, {simd_cast, f}
a = 9, 10, 11, 12, 13, 14, 15, 16
@ -4808,8 +4808,8 @@ generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t
/// Unsigned Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle4, d:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4, e:uint16x4_t, c, c, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, e:uint16x4_t, c, c, [4, 5, 6, 7]
multi_fn = vabd_u16, d, e, f:uint16x4_t
multi_fn = simd_add, a, {simd_cast, f}
a = 9, 10, 11, 12
@ -4823,8 +4823,8 @@ generate uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t
/// Unsigned Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle2, d:uint32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2, e:uint32x2_t, c, c, [2, 3]
multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, e:uint32x2_t, c, c, [2, 3]
multi_fn = vabd_u32, d, e, f:uint32x2_t
multi_fn = simd_add, a, {simd_cast, f}
a = 15, 16
@ -4884,8 +4884,8 @@ generate int64x2_t:int32x2_t:int32x2_t:int64x2_t
/// Signed Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle8, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = vabd_s8, d, e, f:int8x8_t
multi_fn = simd_cast, f:uint8x8_t, f
multi_fn = simd_add, a, {simd_cast, f}
@ -4900,8 +4900,8 @@ generate int16x8_t:int8x16_t:int8x16_t:int16x8_t
/// Signed Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle4, d:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4, e:int16x4_t, c, c, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, e:int16x4_t, c, c, [4, 5, 6, 7]
multi_fn = vabd_s16, d, e, f:int16x4_t
multi_fn = simd_cast, f:uint16x4_t, f
multi_fn = simd_add, a, {simd_cast, f}
@ -4916,8 +4916,8 @@ generate int32x4_t:int16x8_t:int16x8_t:int32x4_t
/// Signed Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle2, d:int32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2, e:int32x2_t, c, c, [2, 3]
multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, e:int32x2_t, c, c, [2, 3]
multi_fn = vabd_s32, d, e, f:int32x2_t
multi_fn = simd_cast, f:uint32x2_t, f
multi_fn = simd_add, a, {simd_cast, f}

View file

@ -988,6 +988,17 @@ fn gen_aarch64(
);
}
};
let const_declare = if let Some(constn) = constn {
if constn.contains(":") {
let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
assert_eq!(constns.len(), 2);
format!(r#"<const {}: i32, const {}: i32>"#, constns[0], constns[1])
} else {
format!(r#"<const {}: i32>"#, constn)
}
} else {
String::new()
};
let multi_calls = if !multi_fn.is_empty() {
let mut calls = String::new();
for i in 0..multi_fn.len() {
@ -997,6 +1008,7 @@ fn gen_aarch64(
calls.push_str(&get_call(
&multi_fn[i],
current_name,
&const_declare,
in_t,
out_t,
fixed,
@ -1007,17 +1019,6 @@ fn gen_aarch64(
} else {
String::new()
};
let const_declare = if let Some(constn) = constn {
if constn.contains(":") {
let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
assert_eq!(constns.len(), 2);
format!(r#"<const {}: i32, const {}: i32>"#, constns[0], constns[1])
} else {
format!(r#"<const {}: i32>"#, constn)
}
} else {
String::new()
};
let const_assert = if let Some(constn) = constn {
if constn.contains(":") {
let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
@ -1582,6 +1583,11 @@ fn gen_arm(
));
}
};
let const_declare = if let Some(constn) = constn {
format!(r#"<const {}: i32>"#, constn)
} else {
String::new()
};
let multi_calls = if !multi_fn.is_empty() {
let mut calls = String::new();
for i in 0..multi_fn.len() {
@ -1591,6 +1597,7 @@ fn gen_arm(
calls.push_str(&get_call(
&multi_fn[i],
current_name,
&const_declare,
in_t,
out_t,
fixed,
@ -1601,11 +1608,6 @@ fn gen_arm(
} else {
String::new()
};
let const_declare = if let Some(constn) = constn {
format!(r#"<const {}: i32>"#, constn)
} else {
String::new()
};
let const_assert = if let Some(constn) = constn {
format!(
r#", {} = {}"#,
@ -2003,6 +2005,7 @@ fn expand_intrinsic(intr: &str, t: &str) -> String {
fn get_call(
in_str: &str,
current_name: &str,
const_declare: &str,
in_t: &[&str; 3],
out_t: &str,
fixed: &Vec<String>,
@ -2041,7 +2044,7 @@ fn get_call(
"halflen" => type_len(in_t[1]) / 2,
_ => 0,
};
let mut s = String::from("[");
let mut s = format!("{} [", const_declare);
for i in 0..len {
if i != 0 {
s.push_str(", ");
@ -2084,7 +2087,7 @@ fn get_call(
"in0_len" => type_len(in_t[0]),
_ => 0,
};
let mut s = String::from("[");
let mut s = format!("{} [", const_declare);
for i in 0..len {
if i != 0 {
s.push_str(", ");
@ -2167,7 +2170,15 @@ fn get_call(
let sub_match = format!(
" {} => {},\n",
i,
get_call(&sub_call, current_name, in_t, out_t, fixed, Some(i as i32))
get_call(
&sub_call,
current_name,
const_declare,
in_t,
out_t,
fixed,
Some(i as i32)
)
);
call.push_str(&sub_match);
}
@ -2210,6 +2221,7 @@ fn get_call(
let sub_call = get_call(
&sub_fn[1..sub_fn.len() - 1],
current_name,
const_declare,
in_t,
out_t,
fixed,