manually const-ify shuffle arguments (#1160)

This commit is contained in:
Ralf Jung 2021-05-11 22:11:52 +02:00 committed by GitHub
parent 7516a80c31
commit a34883b5d3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 1655 additions and 1549 deletions

File diff suppressed because it is too large Load diff

View file

@ -1595,7 +1595,7 @@ pub unsafe fn vext_f64<const N: i32>(a: float64x1_t, _b: float64x1_t) -> float64
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_s8(low: int8x8_t, high: int8x8_t) -> int8x16_t {
simd_shuffle16(
simd_shuffle16!(
low,
high,
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
@ -1607,7 +1607,7 @@ pub unsafe fn vcombine_s8(low: int8x8_t, high: int8x8_t) -> int8x16_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_s16(low: int16x4_t, high: int16x4_t) -> int16x8_t {
simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Vector combine
@ -1615,7 +1615,7 @@ pub unsafe fn vcombine_s16(low: int16x4_t, high: int16x4_t) -> int16x8_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_s32(low: int32x2_t, high: int32x2_t) -> int32x4_t {
simd_shuffle4(low, high, [0, 1, 2, 3])
simd_shuffle4!(low, high, [0, 1, 2, 3])
}
/// Vector combine
@ -1623,7 +1623,7 @@ pub unsafe fn vcombine_s32(low: int32x2_t, high: int32x2_t) -> int32x4_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_s64(low: int64x1_t, high: int64x1_t) -> int64x2_t {
simd_shuffle2(low, high, [0, 1])
simd_shuffle2!(low, high, [0, 1])
}
/// Vector combine
@ -1631,7 +1631,7 @@ pub unsafe fn vcombine_s64(low: int64x1_t, high: int64x1_t) -> int64x2_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_u8(low: uint8x8_t, high: uint8x8_t) -> uint8x16_t {
simd_shuffle16(
simd_shuffle16!(
low,
high,
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
@ -1643,7 +1643,7 @@ pub unsafe fn vcombine_u8(low: uint8x8_t, high: uint8x8_t) -> uint8x16_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_u16(low: uint16x4_t, high: uint16x4_t) -> uint16x8_t {
simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Vector combine
@ -1651,7 +1651,7 @@ pub unsafe fn vcombine_u16(low: uint16x4_t, high: uint16x4_t) -> uint16x8_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_u32(low: uint32x2_t, high: uint32x2_t) -> uint32x4_t {
simd_shuffle4(low, high, [0, 1, 2, 3])
simd_shuffle4!(low, high, [0, 1, 2, 3])
}
/// Vector combine
@ -1659,7 +1659,7 @@ pub unsafe fn vcombine_u32(low: uint32x2_t, high: uint32x2_t) -> uint32x4_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_u64(low: uint64x1_t, high: uint64x1_t) -> uint64x2_t {
simd_shuffle2(low, high, [0, 1])
simd_shuffle2!(low, high, [0, 1])
}
/// Vector combine
@ -1667,7 +1667,7 @@ pub unsafe fn vcombine_u64(low: uint64x1_t, high: uint64x1_t) -> uint64x2_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_p64(low: poly64x1_t, high: poly64x1_t) -> poly64x2_t {
simd_shuffle2(low, high, [0, 1])
simd_shuffle2!(low, high, [0, 1])
}
/// Duplicate vector element to vector or scalar
@ -1772,7 +1772,7 @@ pub unsafe fn vget_low_p64(a: poly64x2_t) -> poly64x1_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_f16 ( low: float16x4_t, high: float16x4_t) -> float16x8_t {
simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
}
*/
@ -1781,7 +1781,7 @@ pub unsafe fn vcombine_f16 ( low: float16x4_t, high: float16x4_t) -> float16x8_
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_f32(low: float32x2_t, high: float32x2_t) -> float32x4_t {
simd_shuffle4(low, high, [0, 1, 2, 3])
simd_shuffle4!(low, high, [0, 1, 2, 3])
}
/// Vector combine
@ -1789,7 +1789,7 @@ pub unsafe fn vcombine_f32(low: float32x2_t, high: float32x2_t) -> float32x4_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_p8(low: poly8x8_t, high: poly8x8_t) -> poly8x16_t {
simd_shuffle16(
simd_shuffle16!(
low,
high,
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
@ -1801,7 +1801,7 @@ pub unsafe fn vcombine_p8(low: poly8x8_t, high: poly8x8_t) -> poly8x16_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_p16(low: poly16x4_t, high: poly16x4_t) -> poly16x8_t {
simd_shuffle8(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(low, high, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Vector combine
@ -1809,7 +1809,7 @@ pub unsafe fn vcombine_p16(low: poly16x4_t, high: poly16x4_t) -> poly16x8_t {
#[target_feature(enable = "neon")]
#[cfg_attr(test, assert_instr(mov))]
pub unsafe fn vcombine_f64(low: float64x1_t, high: float64x1_t) -> float64x2_t {
simd_shuffle2(low, high, [0, 1])
simd_shuffle2!(low, high, [0, 1])
}
/// Table look-up

View file

@ -580,7 +580,7 @@ pub unsafe fn vld1q_lane_f32<const LANE: i32>(ptr: *const f32, src: float32x4_t)
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_s8(ptr: *const i8) -> int8x8_t {
let x = vld1_lane_s8::<0>(ptr, transmute(i8x8::splat(0)));
simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -591,7 +591,7 @@ pub unsafe fn vld1_dup_s8(ptr: *const i8) -> int8x8_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_s8(ptr: *const i8) -> int8x16_t {
let x = vld1q_lane_s8::<0>(ptr, transmute(i8x16::splat(0)));
simd_shuffle16(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle16!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -602,7 +602,7 @@ pub unsafe fn vld1q_dup_s8(ptr: *const i8) -> int8x16_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_s16(ptr: *const i16) -> int16x4_t {
let x = vld1_lane_s16::<0>(ptr, transmute(i16x4::splat(0)));
simd_shuffle4(x, x, [0, 0, 0, 0])
simd_shuffle4!(x, x, [0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -613,7 +613,7 @@ pub unsafe fn vld1_dup_s16(ptr: *const i16) -> int16x4_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_s16(ptr: *const i16) -> int16x8_t {
let x = vld1q_lane_s16::<0>(ptr, transmute(i16x8::splat(0)));
simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -624,7 +624,7 @@ pub unsafe fn vld1q_dup_s16(ptr: *const i16) -> int16x8_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_s32(ptr: *const i32) -> int32x2_t {
let x = vld1_lane_s32::<0>(ptr, transmute(i32x2::splat(0)));
simd_shuffle2(x, x, [0, 0])
simd_shuffle2!(x, x, [0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -635,7 +635,7 @@ pub unsafe fn vld1_dup_s32(ptr: *const i32) -> int32x2_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_s32(ptr: *const i32) -> int32x4_t {
let x = vld1q_lane_s32::<0>(ptr, transmute(i32x4::splat(0)));
simd_shuffle4(x, x, [0, 0, 0, 0])
simd_shuffle4!(x, x, [0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -663,7 +663,7 @@ pub unsafe fn vld1_dup_s64(ptr: *const i64) -> int64x1_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_s64(ptr: *const i64) -> int64x2_t {
let x = vld1q_lane_s64::<0>(ptr, transmute(i64x2::splat(0)));
simd_shuffle2(x, x, [0, 0])
simd_shuffle2!(x, x, [0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -674,7 +674,7 @@ pub unsafe fn vld1q_dup_s64(ptr: *const i64) -> int64x2_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_u8(ptr: *const u8) -> uint8x8_t {
let x = vld1_lane_u8::<0>(ptr, transmute(u8x8::splat(0)));
simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -685,7 +685,7 @@ pub unsafe fn vld1_dup_u8(ptr: *const u8) -> uint8x8_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_u8(ptr: *const u8) -> uint8x16_t {
let x = vld1q_lane_u8::<0>(ptr, transmute(u8x16::splat(0)));
simd_shuffle16(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle16!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -696,7 +696,7 @@ pub unsafe fn vld1q_dup_u8(ptr: *const u8) -> uint8x16_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_u16(ptr: *const u16) -> uint16x4_t {
let x = vld1_lane_u16::<0>(ptr, transmute(u16x4::splat(0)));
simd_shuffle4(x, x, [0, 0, 0, 0])
simd_shuffle4!(x, x, [0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -707,7 +707,7 @@ pub unsafe fn vld1_dup_u16(ptr: *const u16) -> uint16x4_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_u16(ptr: *const u16) -> uint16x8_t {
let x = vld1q_lane_u16::<0>(ptr, transmute(u16x8::splat(0)));
simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -718,7 +718,7 @@ pub unsafe fn vld1q_dup_u16(ptr: *const u16) -> uint16x8_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_u32(ptr: *const u32) -> uint32x2_t {
let x = vld1_lane_u32::<0>(ptr, transmute(u32x2::splat(0)));
simd_shuffle2(x, x, [0, 0])
simd_shuffle2!(x, x, [0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -729,7 +729,7 @@ pub unsafe fn vld1_dup_u32(ptr: *const u32) -> uint32x2_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_u32(ptr: *const u32) -> uint32x4_t {
let x = vld1q_lane_u32::<0>(ptr, transmute(u32x4::splat(0)));
simd_shuffle4(x, x, [0, 0, 0, 0])
simd_shuffle4!(x, x, [0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -757,7 +757,7 @@ pub unsafe fn vld1_dup_u64(ptr: *const u64) -> uint64x1_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_u64(ptr: *const u64) -> uint64x2_t {
let x = vld1q_lane_u64::<0>(ptr, transmute(u64x2::splat(0)));
simd_shuffle2(x, x, [0, 0])
simd_shuffle2!(x, x, [0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -768,7 +768,7 @@ pub unsafe fn vld1q_dup_u64(ptr: *const u64) -> uint64x2_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_p8(ptr: *const p8) -> poly8x8_t {
let x = vld1_lane_p8::<0>(ptr, transmute(u8x8::splat(0)));
simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -779,7 +779,7 @@ pub unsafe fn vld1_dup_p8(ptr: *const p8) -> poly8x8_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_p8(ptr: *const p8) -> poly8x16_t {
let x = vld1q_lane_p8::<0>(ptr, transmute(u8x16::splat(0)));
simd_shuffle16(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle16!(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -790,7 +790,7 @@ pub unsafe fn vld1q_dup_p8(ptr: *const p8) -> poly8x16_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_p16(ptr: *const p16) -> poly16x4_t {
let x = vld1_lane_p16::<0>(ptr, transmute(u16x4::splat(0)));
simd_shuffle4(x, x, [0, 0, 0, 0])
simd_shuffle4!(x, x, [0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -801,7 +801,7 @@ pub unsafe fn vld1_dup_p16(ptr: *const p16) -> poly16x4_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_p16(ptr: *const p16) -> poly16x8_t {
let x = vld1q_lane_p16::<0>(ptr, transmute(u16x8::splat(0)));
simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(x, x, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -812,7 +812,7 @@ pub unsafe fn vld1q_dup_p16(ptr: *const p16) -> poly16x8_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1_dup_f32(ptr: *const f32) -> float32x2_t {
let x = vld1_lane_f32::<0>(ptr, transmute(f32x2::splat(0.)));
simd_shuffle2(x, x, [0, 0])
simd_shuffle2!(x, x, [0, 0])
}
/// Load one single-element structure and Replicate to all lanes (of one register).
@ -823,7 +823,7 @@ pub unsafe fn vld1_dup_f32(ptr: *const f32) -> float32x2_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))]
pub unsafe fn vld1q_dup_f32(ptr: *const f32) -> float32x4_t {
let x = vld1q_lane_f32::<0>(ptr, transmute(f32x4::splat(0.)));
simd_shuffle4(x, x, [0, 0, 0, 0])
simd_shuffle4!(x, x, [0, 0, 0, 0])
}
// signed absolute difference and accumulate (64-bit)
@ -1284,8 +1284,8 @@ pub unsafe fn vaddl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))]
pub unsafe fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
let a: int8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let a: int8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let a: int16x8_t = simd_cast(a);
let b: int16x8_t = simd_cast(b);
simd_add(a, b)
@ -1298,8 +1298,8 @@ pub unsafe fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))]
pub unsafe fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
let a: int16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
let a: int32x4_t = simd_cast(a);
let b: int32x4_t = simd_cast(b);
simd_add(a, b)
@ -1312,8 +1312,8 @@ pub unsafe fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl2))]
pub unsafe fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
let a: int32x2_t = simd_shuffle2(a, a, [2, 3]);
let b: int32x2_t = simd_shuffle2(b, b, [2, 3]);
let a: int32x2_t = simd_shuffle2!(a, a, [2, 3]);
let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
let a: int64x2_t = simd_cast(a);
let b: int64x2_t = simd_cast(b);
simd_add(a, b)
@ -1326,8 +1326,8 @@ pub unsafe fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))]
pub unsafe fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
let a: uint8x8_t = simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let a: uint8x8_t = simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let a: uint16x8_t = simd_cast(a);
let b: uint16x8_t = simd_cast(b);
simd_add(a, b)
@ -1340,8 +1340,8 @@ pub unsafe fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))]
pub unsafe fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
let a: uint16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
let a: uint16x4_t = simd_shuffle4!(a, a, [4, 5, 6, 7]);
let b: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
let a: uint32x4_t = simd_cast(a);
let b: uint32x4_t = simd_cast(b);
simd_add(a, b)
@ -1354,8 +1354,8 @@ pub unsafe fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl2))]
pub unsafe fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
let a: uint32x2_t = simd_shuffle2(a, a, [2, 3]);
let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
let a: uint32x2_t = simd_shuffle2!(a, a, [2, 3]);
let b: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
let a: uint64x2_t = simd_cast(a);
let b: uint64x2_t = simd_cast(b);
simd_add(a, b)
@ -1434,7 +1434,7 @@ pub unsafe fn vaddw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))]
pub unsafe fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
let b: int8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: int8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: int16x8_t = simd_cast(b);
simd_add(a, b)
}
@ -1446,7 +1446,7 @@ pub unsafe fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))]
pub unsafe fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
let b: int16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
let b: int32x4_t = simd_cast(b);
simd_add(a, b)
}
@ -1458,7 +1458,7 @@ pub unsafe fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddw2))]
pub unsafe fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
let b: int32x2_t = simd_shuffle2(b, b, [2, 3]);
let b: int32x2_t = simd_shuffle2!(b, b, [2, 3]);
let b: int64x2_t = simd_cast(b);
simd_add(a, b)
}
@ -1470,7 +1470,7 @@ pub unsafe fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))]
pub unsafe fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t {
let b: uint8x8_t = simd_shuffle8(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: uint8x8_t = simd_shuffle8!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
let b: uint16x8_t = simd_cast(b);
simd_add(a, b)
}
@ -1482,7 +1482,7 @@ pub unsafe fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))]
pub unsafe fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
let b: uint16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]);
let b: uint16x4_t = simd_shuffle4!(b, b, [4, 5, 6, 7]);
let b: uint32x4_t = simd_cast(b);
simd_add(a, b)
}
@ -1494,7 +1494,7 @@ pub unsafe fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddw))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddw2))]
pub unsafe fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t {
let b: uint32x2_t = simd_shuffle2(b, b, [2, 3]);
let b: uint32x2_t = simd_shuffle2!(b, b, [2, 3]);
let b: uint64x2_t = simd_cast(b);
simd_add(a, b)
}
@ -1567,7 +1567,7 @@ pub unsafe fn vaddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
pub unsafe fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t {
let x = simd_cast(simd_shr(simd_add(a, b), int16x8_t(8, 8, 8, 8, 8, 8, 8, 8)));
simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
}
/// Add returning High Narrow (high half).
@ -1578,7 +1578,7 @@ pub unsafe fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x1
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
pub unsafe fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t {
let x = simd_cast(simd_shr(simd_add(a, b), int32x4_t(16, 16, 16, 16)));
simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Add returning High Narrow (high half).
@ -1589,7 +1589,7 @@ pub unsafe fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
pub unsafe fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t {
let x = simd_cast(simd_shr(simd_add(a, b), int64x2_t(32, 32)));
simd_shuffle4(r, x, [0, 1, 2, 3])
simd_shuffle4!(r, x, [0, 1, 2, 3])
}
/// Add returning High Narrow (high half).
@ -1600,7 +1600,7 @@ pub unsafe fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
pub unsafe fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t {
let x = simd_cast(simd_shr(simd_add(a, b), uint16x8_t(8, 8, 8, 8, 8, 8, 8, 8)));
simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
}
/// Add returning High Narrow (high half).
@ -1611,7 +1611,7 @@ pub unsafe fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uin
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
pub unsafe fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t {
let x = simd_cast(simd_shr(simd_add(a, b), uint32x4_t(16, 16, 16, 16)));
simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Add returning High Narrow (high half).
@ -1622,7 +1622,7 @@ pub unsafe fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> ui
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(addhn2))]
pub unsafe fn vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t {
let x = simd_cast(simd_shr(simd_add(a, b), uint64x2_t(32, 32)));
simd_shuffle4(r, x, [0, 1, 2, 3])
simd_shuffle4!(r, x, [0, 1, 2, 3])
}
/// Rounding Add returning High Narrow.
@ -1693,7 +1693,7 @@ pub unsafe fn vraddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
pub unsafe fn vraddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t {
let x = vraddhn_s16_(a, b);
simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
}
/// Rounding Add returning High Narrow (high half).
@ -1704,7 +1704,7 @@ pub unsafe fn vraddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
pub unsafe fn vraddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t {
let x = vraddhn_s32_(a, b);
simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Rounding Add returning High Narrow (high half).
@ -1715,7 +1715,7 @@ pub unsafe fn vraddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int1
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
pub unsafe fn vraddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t {
let x = vraddhn_s64_(a, b);
simd_shuffle4(r, x, [0, 1, 2, 3])
simd_shuffle4!(r, x, [0, 1, 2, 3])
}
/// Rounding Add returning High Narrow (high half).
@ -1726,7 +1726,7 @@ pub unsafe fn vraddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int3
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
pub unsafe fn vraddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t {
let x: uint8x8_t = transmute(vraddhn_s16_(transmute(a), transmute(b)));
simd_shuffle16(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
simd_shuffle16!(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
}
/// Rounding Add returning High Narrow (high half).
@ -1737,7 +1737,7 @@ pub unsafe fn vraddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> ui
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
pub unsafe fn vraddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t {
let x: uint16x4_t = transmute(vraddhn_s32_(transmute(a), transmute(b)));
simd_shuffle8(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(r, x, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Rounding Add returning High Narrow (high half).
@ -1748,7 +1748,7 @@ pub unsafe fn vraddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> u
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(raddhn2))]
pub unsafe fn vraddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t {
let x: uint32x2_t = transmute(vraddhn_s64_(transmute(a), transmute(b)));
simd_shuffle4(r, x, [0, 1, 2, 3])
simd_shuffle4!(r, x, [0, 1, 2, 3])
}
/// Signed Add Long Pairwise.
@ -2961,7 +2961,7 @@ pub unsafe fn vget_lane_u8<const IMM5: i32>(v: uint8x8_t) -> u8 {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_s8(a: int8x16_t) -> int8x8_t {
simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
}
/// Duplicate vector element to vector or scalar
@ -2971,7 +2971,7 @@ pub unsafe fn vget_high_s8(a: int8x16_t) -> int8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_s16(a: int16x8_t) -> int16x4_t {
simd_shuffle4(a, a, [4, 5, 6, 7])
simd_shuffle4!(a, a, [4, 5, 6, 7])
}
/// Duplicate vector element to vector or scalar
@ -2981,7 +2981,7 @@ pub unsafe fn vget_high_s16(a: int16x8_t) -> int16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_s32(a: int32x4_t) -> int32x2_t {
simd_shuffle2(a, a, [2, 3])
simd_shuffle2!(a, a, [2, 3])
}
/// Duplicate vector element to vector or scalar
@ -3001,7 +3001,7 @@ pub unsafe fn vget_high_s64(a: int64x2_t) -> int64x1_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_u8(a: uint8x16_t) -> uint8x8_t {
simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
}
/// Duplicate vector element to vector or scalar
@ -3011,7 +3011,7 @@ pub unsafe fn vget_high_u8(a: uint8x16_t) -> uint8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_u16(a: uint16x8_t) -> uint16x4_t {
simd_shuffle4(a, a, [4, 5, 6, 7])
simd_shuffle4!(a, a, [4, 5, 6, 7])
}
/// Duplicate vector element to vector or scalar
@ -3021,7 +3021,7 @@ pub unsafe fn vget_high_u16(a: uint16x8_t) -> uint16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_u32(a: uint32x4_t) -> uint32x2_t {
simd_shuffle2(a, a, [2, 3])
simd_shuffle2!(a, a, [2, 3])
}
/// Duplicate vector element to vector or scalar
@ -3041,7 +3041,7 @@ pub unsafe fn vget_high_u64(a: uint64x2_t) -> uint64x1_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_p8(a: poly8x16_t) -> poly8x8_t {
simd_shuffle8(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
simd_shuffle8!(a, a, [8, 9, 10, 11, 12, 13, 14, 15])
}
/// Duplicate vector element to vector or scalar
@ -3051,7 +3051,7 @@ pub unsafe fn vget_high_p8(a: poly8x16_t) -> poly8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_p16(a: poly16x8_t) -> poly16x4_t {
simd_shuffle4(a, a, [4, 5, 6, 7])
simd_shuffle4!(a, a, [4, 5, 6, 7])
}
/// Duplicate vector element to vector or scalar
@ -3061,7 +3061,7 @@ pub unsafe fn vget_high_p16(a: poly16x8_t) -> poly16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmov"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))]
pub unsafe fn vget_high_f32(a: float32x4_t) -> float32x2_t {
simd_shuffle2(a, a, [2, 3])
simd_shuffle2!(a, a, [2, 3])
}
/// Duplicate vector element to vector or scalar
@ -3071,7 +3071,7 @@ pub unsafe fn vget_high_f32(a: float32x4_t) -> float32x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_s8(a: int8x16_t) -> int8x8_t {
simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Duplicate vector element to vector or scalar
@ -3081,7 +3081,7 @@ pub unsafe fn vget_low_s8(a: int8x16_t) -> int8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_s16(a: int16x8_t) -> int16x4_t {
simd_shuffle4(a, a, [0, 1, 2, 3])
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
/// Duplicate vector element to vector or scalar
@ -3091,7 +3091,7 @@ pub unsafe fn vget_low_s16(a: int16x8_t) -> int16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_s32(a: int32x4_t) -> int32x2_t {
simd_shuffle2(a, a, [0, 1])
simd_shuffle2!(a, a, [0, 1])
}
/// Duplicate vector element to vector or scalar
@ -3111,7 +3111,7 @@ pub unsafe fn vget_low_s64(a: int64x2_t) -> int64x1_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_u8(a: uint8x16_t) -> uint8x8_t {
simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Duplicate vector element to vector or scalar
@ -3121,7 +3121,7 @@ pub unsafe fn vget_low_u8(a: uint8x16_t) -> uint8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_u16(a: uint16x8_t) -> uint16x4_t {
simd_shuffle4(a, a, [0, 1, 2, 3])
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
/// Duplicate vector element to vector or scalar
@ -3131,7 +3131,7 @@ pub unsafe fn vget_low_u16(a: uint16x8_t) -> uint16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_u32(a: uint32x4_t) -> uint32x2_t {
simd_shuffle2(a, a, [0, 1])
simd_shuffle2!(a, a, [0, 1])
}
/// Duplicate vector element to vector or scalar
@ -3151,7 +3151,7 @@ pub unsafe fn vget_low_u64(a: uint64x2_t) -> uint64x1_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_p8(a: poly8x16_t) -> poly8x8_t {
simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Duplicate vector element to vector or scalar
@ -3161,7 +3161,7 @@ pub unsafe fn vget_low_p8(a: poly8x16_t) -> poly8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_p16(a: poly16x8_t) -> poly16x4_t {
simd_shuffle4(a, a, [0, 1, 2, 3])
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
/// Duplicate vector element to vector or scalar
@ -3171,7 +3171,7 @@ pub unsafe fn vget_low_p16(a: poly16x8_t) -> poly16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("ldr"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))]
pub unsafe fn vget_low_f32(a: float32x4_t) -> float32x2_t {
simd_shuffle2(a, a, [0, 1])
simd_shuffle2!(a, a, [0, 1])
}
/// Duplicate vector element to vector or scalar
@ -3713,7 +3713,7 @@ pub unsafe fn vcntq_p8(a: poly8x16_t) -> poly8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
pub unsafe fn vrev16_s8(a: int8x8_t) -> int8x8_t {
simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
}
/// Reversing vector elements (swap endianness)
@ -3723,7 +3723,7 @@ pub unsafe fn vrev16_s8(a: int8x8_t) -> int8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
pub unsafe fn vrev16q_s8(a: int8x16_t) -> int8x16_t {
simd_shuffle16(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
simd_shuffle16!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
}
/// Reversing vector elements (swap endianness)
@ -3733,7 +3733,7 @@ pub unsafe fn vrev16q_s8(a: int8x16_t) -> int8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
pub unsafe fn vrev16_u8(a: uint8x8_t) -> uint8x8_t {
simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
}
/// Reversing vector elements (swap endianness)
@ -3743,7 +3743,7 @@ pub unsafe fn vrev16_u8(a: uint8x8_t) -> uint8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
pub unsafe fn vrev16q_u8(a: uint8x16_t) -> uint8x16_t {
simd_shuffle16(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
simd_shuffle16!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
}
/// Reversing vector elements (swap endianness)
@ -3753,7 +3753,7 @@ pub unsafe fn vrev16q_u8(a: uint8x16_t) -> uint8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
pub unsafe fn vrev16_p8(a: poly8x8_t) -> poly8x8_t {
simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
}
/// Reversing vector elements (swap endianness)
@ -3763,7 +3763,7 @@ pub unsafe fn vrev16_p8(a: poly8x8_t) -> poly8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev16.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev16))]
pub unsafe fn vrev16q_p8(a: poly8x16_t) -> poly8x16_t {
simd_shuffle16(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
simd_shuffle16!(a, a, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
}
/// Reversing vector elements (swap endianness)
@ -3773,7 +3773,7 @@ pub unsafe fn vrev16q_p8(a: poly8x16_t) -> poly8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32_s8(a: int8x8_t) -> int8x8_t {
simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
}
/// Reversing vector elements (swap endianness)
@ -3783,7 +3783,7 @@ pub unsafe fn vrev32_s8(a: int8x8_t) -> int8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32q_s8(a: int8x16_t) -> int8x16_t {
simd_shuffle16(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
simd_shuffle16!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
}
/// Reversing vector elements (swap endianness)
@ -3793,7 +3793,7 @@ pub unsafe fn vrev32q_s8(a: int8x16_t) -> int8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32_u8(a: uint8x8_t) -> uint8x8_t {
simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
}
/// Reversing vector elements (swap endianness)
@ -3803,7 +3803,7 @@ pub unsafe fn vrev32_u8(a: uint8x8_t) -> uint8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32q_u8(a: uint8x16_t) -> uint8x16_t {
simd_shuffle16(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
simd_shuffle16!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
}
/// Reversing vector elements (swap endianness)
@ -3813,7 +3813,7 @@ pub unsafe fn vrev32q_u8(a: uint8x16_t) -> uint8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32_s16(a: int16x4_t) -> int16x4_t {
simd_shuffle4(a, a, [1, 0, 3, 2])
simd_shuffle4!(a, a, [1, 0, 3, 2])
}
/// Reversing vector elements (swap endianness)
@ -3823,7 +3823,7 @@ pub unsafe fn vrev32_s16(a: int16x4_t) -> int16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32q_s16(a: int16x8_t) -> int16x8_t {
simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
}
/// Reversing vector elements (swap endianness)
@ -3833,7 +3833,7 @@ pub unsafe fn vrev32q_s16(a: int16x8_t) -> int16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32_p16(a: poly16x4_t) -> poly16x4_t {
simd_shuffle4(a, a, [1, 0, 3, 2])
simd_shuffle4!(a, a, [1, 0, 3, 2])
}
/// Reversing vector elements (swap endianness)
@ -3843,7 +3843,7 @@ pub unsafe fn vrev32_p16(a: poly16x4_t) -> poly16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32q_p16(a: poly16x8_t) -> poly16x8_t {
simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
}
/// Reversing vector elements (swap endianness)
@ -3853,7 +3853,7 @@ pub unsafe fn vrev32q_p16(a: poly16x8_t) -> poly16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32_u16(a: uint16x4_t) -> uint16x4_t {
simd_shuffle4(a, a, [1, 0, 3, 2])
simd_shuffle4!(a, a, [1, 0, 3, 2])
}
/// Reversing vector elements (swap endianness)
@ -3863,7 +3863,7 @@ pub unsafe fn vrev32_u16(a: uint16x4_t) -> uint16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32q_u16(a: uint16x8_t) -> uint16x8_t {
simd_shuffle8(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
simd_shuffle8!(a, a, [1, 0, 3, 2, 5, 4, 7, 6])
}
/// Reversing vector elements (swap endianness)
@ -3873,7 +3873,7 @@ pub unsafe fn vrev32q_u16(a: uint16x8_t) -> uint16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32_p8(a: poly8x8_t) -> poly8x8_t {
simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
}
/// Reversing vector elements (swap endianness)
@ -3883,7 +3883,7 @@ pub unsafe fn vrev32_p8(a: poly8x8_t) -> poly8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev32.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev32))]
pub unsafe fn vrev32q_p8(a: poly8x16_t) -> poly8x16_t {
simd_shuffle16(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
simd_shuffle16!(a, a, [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12])
}
/// Reversing vector elements (swap endianness)
@ -3893,7 +3893,7 @@ pub unsafe fn vrev32q_p8(a: poly8x16_t) -> poly8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_s8(a: int8x8_t) -> int8x8_t {
simd_shuffle8(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
simd_shuffle8!(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
}
/// Reversing vector elements (swap endianness)
@ -3903,7 +3903,7 @@ pub unsafe fn vrev64_s8(a: int8x8_t) -> int8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_s8(a: int8x16_t) -> int8x16_t {
simd_shuffle16(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
simd_shuffle16!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
}
/// Reversing vector elements (swap endianness)
@ -3913,7 +3913,7 @@ pub unsafe fn vrev64q_s8(a: int8x16_t) -> int8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_s16(a: int16x4_t) -> int16x4_t {
simd_shuffle4(a, a, [3, 2, 1, 0])
simd_shuffle4!(a, a, [3, 2, 1, 0])
}
/// Reversing vector elements (swap endianness)
@ -3923,7 +3923,7 @@ pub unsafe fn vrev64_s16(a: int16x4_t) -> int16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_s16(a: int16x8_t) -> int16x8_t {
simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
}
/// Reversing vector elements (swap endianness)
@ -3933,7 +3933,7 @@ pub unsafe fn vrev64q_s16(a: int16x8_t) -> int16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_s32(a: int32x2_t) -> int32x2_t {
simd_shuffle2(a, a, [1, 0])
simd_shuffle2!(a, a, [1, 0])
}
/// Reversing vector elements (swap endianness)
@ -3943,7 +3943,7 @@ pub unsafe fn vrev64_s32(a: int32x2_t) -> int32x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_s32(a: int32x4_t) -> int32x4_t {
simd_shuffle4(a, a, [1, 0, 3, 2])
simd_shuffle4!(a, a, [1, 0, 3, 2])
}
/// Reversing vector elements (swap endianness)
@ -3953,7 +3953,7 @@ pub unsafe fn vrev64q_s32(a: int32x4_t) -> int32x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_u8(a: uint8x8_t) -> uint8x8_t {
simd_shuffle8(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
simd_shuffle8!(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
}
/// Reversing vector elements (swap endianness)
@ -3963,7 +3963,7 @@ pub unsafe fn vrev64_u8(a: uint8x8_t) -> uint8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_u8(a: uint8x16_t) -> uint8x16_t {
simd_shuffle16(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
simd_shuffle16!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
}
/// Reversing vector elements (swap endianness)
@ -3973,7 +3973,7 @@ pub unsafe fn vrev64q_u8(a: uint8x16_t) -> uint8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_u16(a: uint16x4_t) -> uint16x4_t {
simd_shuffle4(a, a, [3, 2, 1, 0])
simd_shuffle4!(a, a, [3, 2, 1, 0])
}
/// Reversing vector elements (swap endianness)
@ -3983,7 +3983,7 @@ pub unsafe fn vrev64_u16(a: uint16x4_t) -> uint16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_u16(a: uint16x8_t) -> uint16x8_t {
simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
}
/// Reversing vector elements (swap endianness)
@ -3993,7 +3993,7 @@ pub unsafe fn vrev64q_u16(a: uint16x8_t) -> uint16x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_u32(a: uint32x2_t) -> uint32x2_t {
simd_shuffle2(a, a, [1, 0])
simd_shuffle2!(a, a, [1, 0])
}
/// Reversing vector elements (swap endianness)
@ -4003,7 +4003,7 @@ pub unsafe fn vrev64_u32(a: uint32x2_t) -> uint32x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_u32(a: uint32x4_t) -> uint32x4_t {
simd_shuffle4(a, a, [1, 0, 3, 2])
simd_shuffle4!(a, a, [1, 0, 3, 2])
}
/// Reversing vector elements (swap endianness)
@ -4013,7 +4013,7 @@ pub unsafe fn vrev64q_u32(a: uint32x4_t) -> uint32x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_f32(a: float32x2_t) -> float32x2_t {
simd_shuffle2(a, a, [1, 0])
simd_shuffle2!(a, a, [1, 0])
}
/// Reversing vector elements (swap endianness)
@ -4023,7 +4023,7 @@ pub unsafe fn vrev64_f32(a: float32x2_t) -> float32x2_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.32"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_f32(a: float32x4_t) -> float32x4_t {
simd_shuffle4(a, a, [1, 0, 3, 2])
simd_shuffle4!(a, a, [1, 0, 3, 2])
}
/// Reversing vector elements (swap endianness)
@ -4033,7 +4033,7 @@ pub unsafe fn vrev64q_f32(a: float32x4_t) -> float32x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_p8(a: poly8x8_t) -> poly8x8_t {
simd_shuffle8(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
simd_shuffle8!(a, a, [7, 6, 5, 4, 3, 2, 1, 0])
}
/// Reversing vector elements (swap endianness)
@ -4043,7 +4043,7 @@ pub unsafe fn vrev64_p8(a: poly8x8_t) -> poly8x8_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.8"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_p8(a: poly8x16_t) -> poly8x16_t {
simd_shuffle16(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
simd_shuffle16!(a, a, [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8])
}
/// Reversing vector elements (swap endianness)
@ -4053,7 +4053,7 @@ pub unsafe fn vrev64q_p8(a: poly8x16_t) -> poly8x16_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64_p16(a: poly16x4_t) -> poly16x4_t {
simd_shuffle4(a, a, [3, 2, 1, 0])
simd_shuffle4!(a, a, [3, 2, 1, 0])
}
/// Reversing vector elements (swap endianness)
@ -4063,7 +4063,7 @@ pub unsafe fn vrev64_p16(a: poly16x4_t) -> poly16x4_t {
#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrev64.16"))]
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rev64))]
pub unsafe fn vrev64q_p16(a: poly16x8_t) -> poly16x8_t {
simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
simd_shuffle8!(a, a, [3, 2, 1, 0, 7, 6, 5, 4])
}
/// Signed Add and Accumulate Long Pairwise.

View file

@ -92,3 +92,99 @@ macro_rules! types {
pub struct $name($($fields)*);
)*)
}
#[allow(unused_macros)]
macro_rules! simd_shuffle2 {
($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
struct ConstParam<$(const $imm: $ty),+>;
impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
const IDX: [u32; 2] = $idx;
}
simd_shuffle2($x, $y, ConstParam::<$($imm),+>::IDX)
}};
($x:expr, $y:expr, $idx:expr $(,)?) => {{
const IDX: [u32; 2] = $idx;
simd_shuffle2($x, $y, IDX)
}};
}
#[allow(unused_macros)]
macro_rules! simd_shuffle4 {
($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
struct ConstParam<$(const $imm: $ty),+>;
impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
const IDX: [u32; 4] = $idx;
}
simd_shuffle4($x, $y, ConstParam::<$($imm),+>::IDX)
}};
($x:expr, $y:expr, $idx:expr $(,)?) => {{
const IDX: [u32; 4] = $idx;
simd_shuffle4($x, $y, IDX)
}};
}
#[allow(unused_macros)]
macro_rules! simd_shuffle8 {
($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
struct ConstParam<$(const $imm: $ty),+>;
impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
const IDX: [u32; 8] = $idx;
}
simd_shuffle8($x, $y, ConstParam::<$($imm),+>::IDX)
}};
($x:expr, $y:expr, $idx:expr $(,)?) => {{
const IDX: [u32; 8] = $idx;
simd_shuffle8($x, $y, IDX)
}};
}
#[allow(unused_macros)]
macro_rules! simd_shuffle16 {
($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
struct ConstParam<$(const $imm: $ty),+>;
impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
const IDX: [u32; 16] = $idx;
}
simd_shuffle16($x, $y, ConstParam::<$($imm),+>::IDX)
}};
($x:expr, $y:expr, $idx:expr $(,)?) => {{
const IDX: [u32; 16] = $idx;
simd_shuffle16($x, $y, IDX)
}};
}
#[allow(unused_macros)]
macro_rules! simd_shuffle32 {
($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
struct ConstParam<$(const $imm: $ty),+>;
impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
const IDX: [u32; 32] = $idx;
}
simd_shuffle32($x, $y, ConstParam::<$($imm),+>::IDX)
}};
($x:expr, $y:expr, $idx:expr $(,)?) => {{
const IDX: [u32; 32] = $idx;
simd_shuffle32($x, $y, IDX)
}};
}
#[allow(unused_macros)]
macro_rules! simd_shuffle64 {
($x:expr, $y:expr, <$(const $imm:ident : $ty:ty),+> $idx:expr $(,)?) => {{
struct ConstParam<$(const $imm: $ty),+>;
impl<$(const $imm: $ty),+> ConstParam<$($imm),+> {
const IDX: [u32; 64] = $idx;
}
simd_shuffle64($x, $y, ConstParam::<$($imm),+>::IDX)
}};
($x:expr, $y:expr, $idx:expr $(,)?) => {{
const IDX: [u32; 64] = $idx;
simd_shuffle64($x, $y, IDX)
}};
}

View file

@ -47,10 +47,10 @@ mod sealed {
#[cfg_attr(all(test, target_endian = "big"), assert_instr(xxspltd, dm = 0x0))]
unsafe fn xxpermdi(a: i64x2, b: i64x2, dm: u8) -> i64x2 {
match dm & 0b11 {
0 => simd_shuffle2(a, b, [0b00, 0b10]),
1 => simd_shuffle2(a, b, [0b01, 0b10]),
2 => simd_shuffle2(a, b, [0b00, 0b11]),
_ => simd_shuffle2(a, b, [0b01, 0b11]),
0 => simd_shuffle2!(a, b, [0b00, 0b10]),
1 => simd_shuffle2!(a, b, [0b01, 0b10]),
2 => simd_shuffle2!(a, b, [0b00, 0b11]),
_ => simd_shuffle2!(a, b, [0b01, 0b11]),
}
}

View file

@ -118,10 +118,10 @@ pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
static_assert_imm8!(MASK);
simd_shuffle4(
simd_shuffle4!(
a,
b,
[
<const MASK: i32> [
MASK as u32 & 0b1,
((MASK as u32 >> 1) & 0b1) + 4,
((MASK as u32 >> 2) & 0b1) + 2,
@ -141,10 +141,10 @@ pub unsafe fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m2
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
static_assert_imm8!(MASK);
simd_shuffle8(
simd_shuffle8!(
a,
b,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11) + 8,
@ -463,10 +463,10 @@ pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
static_assert_imm4!(IMM4);
simd_shuffle4(
simd_shuffle4!(
a,
b,
[
<const IMM4: i32> [
((IMM4 as u32 >> 0) & 1) * 4 + 0,
((IMM4 as u32 >> 1) & 1) * 4 + 1,
((IMM4 as u32 >> 2) & 1) * 4 + 2,
@ -486,10 +486,10 @@ pub unsafe fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
static_assert_imm8!(IMM8);
simd_shuffle8(
simd_shuffle8!(
a,
b,
[
<const IMM8: i32> [
((IMM8 as u32 >> 0) & 1) * 8 + 0,
((IMM8 as u32 >> 1) & 1) * 8 + 1,
((IMM8 as u32 >> 2) & 1) * 8 + 2,
@ -930,10 +930,10 @@ pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
static_assert_imm1!(IMM1);
simd_shuffle4(
simd_shuffle4!(
a,
_mm256_undefined_ps(),
[[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
<const IMM1: i32> [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
)
}
@ -951,7 +951,7 @@ pub unsafe fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
static_assert_imm1!(IMM1);
simd_shuffle2(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize])
simd_shuffle2!(a, _mm256_undefined_pd(), <const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize])
}
/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
@ -967,10 +967,10 @@ pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
static_assert_imm1!(IMM1);
let dst: i64x2 = simd_shuffle2(
let dst: i64x2 = simd_shuffle2!(
a.as_i64x4(),
_mm256_undefined_si256().as_i64x4(),
[[0, 1], [2, 3]][IMM1 as usize],
<const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize],
);
transmute(dst)
}
@ -1033,10 +1033,10 @@ pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
static_assert_imm8!(IMM8);
simd_shuffle8(
simd_shuffle8!(
a,
_mm256_undefined_ps(),
[
<const IMM8: i32> [
(IMM8 as u32 >> 0) & 0b11,
(IMM8 as u32 >> 2) & 0b11,
(IMM8 as u32 >> 4) & 0b11,
@ -1060,10 +1060,10 @@ pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
static_assert_imm8!(IMM8);
simd_shuffle4(
simd_shuffle4!(
a,
_mm_undefined_ps(),
[
<const IMM8: i32> [
(IMM8 as u32 >> 0) & 0b11,
(IMM8 as u32 >> 2) & 0b11,
(IMM8 as u32 >> 4) & 0b11,
@ -1107,10 +1107,10 @@ pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
static_assert_imm4!(IMM4);
simd_shuffle4(
simd_shuffle4!(
a,
_mm256_undefined_pd(),
[
<const IMM4: i32> [
((IMM4 as u32 >> 0) & 1),
((IMM4 as u32 >> 1) & 1),
((IMM4 as u32 >> 2) & 1) + 2,
@ -1130,10 +1130,10 @@ pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
static_assert_imm2!(IMM2);
simd_shuffle2(
simd_shuffle2!(
a,
_mm_undefined_pd(),
[(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
<const IMM2: i32> [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
)
}
@ -1257,10 +1257,10 @@ pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
static_assert_imm1!(IMM1);
simd_shuffle8(
simd_shuffle8!(
a,
_mm256_castps128_ps256(b),
[[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
<const IMM1: i32> [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
)
}
@ -1279,10 +1279,10 @@ pub unsafe fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
static_assert_imm1!(IMM1);
simd_shuffle4(
simd_shuffle4!(
a,
_mm256_castpd128_pd256(b),
[[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
<const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
)
}
@ -1300,10 +1300,10 @@ pub unsafe fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> _
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
static_assert_imm1!(IMM1);
let dst: i64x4 = simd_shuffle4(
let dst: i64x4 = simd_shuffle4!(
a.as_i64x4(),
_mm256_castsi128_si256(b).as_i64x4(),
[[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
<const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
);
transmute(dst)
}
@ -1639,7 +1639,7 @@ pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
#[cfg_attr(test, assert_instr(vmovshdup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 {
simd_shuffle8(a, a, [1, 1, 3, 3, 5, 5, 7, 7])
simd_shuffle8!(a, a, [1, 1, 3, 3, 5, 5, 7, 7])
}
/// Duplicate even-indexed single-precision (32-bit) floating-point elements
@ -1651,7 +1651,7 @@ pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vmovsldup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 {
simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6])
simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6])
}
/// Duplicate even-indexed double-precision (64-bit) floating-point elements
@ -1663,7 +1663,7 @@ pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d {
simd_shuffle4(a, a, [0, 0, 2, 2])
simd_shuffle4!(a, a, [0, 0, 2, 2])
}
/// Loads 256-bits of integer data from unaligned memory into result.
@ -1756,7 +1756,7 @@ pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vunpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
simd_shuffle4(a, b, [1, 5, 3, 7])
simd_shuffle4!(a, b, [1, 5, 3, 7])
}
/// Unpacks and interleave single-precision (32-bit) floating-point elements
@ -1768,7 +1768,7 @@ pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vunpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
simd_shuffle8(a, b, [2, 10, 3, 11, 6, 14, 7, 15])
simd_shuffle8!(a, b, [2, 10, 3, 11, 6, 14, 7, 15])
}
/// Unpacks and interleave double-precision (64-bit) floating-point elements
@ -1780,7 +1780,7 @@ pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vunpcklpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
simd_shuffle4(a, b, [0, 4, 2, 6])
simd_shuffle4!(a, b, [0, 4, 2, 6])
}
/// Unpacks and interleave single-precision (32-bit) floating-point elements
@ -1792,7 +1792,7 @@ pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vunpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
simd_shuffle8(a, b, [0, 8, 1, 9, 4, 12, 5, 13])
simd_shuffle8!(a, b, [0, 8, 1, 9, 4, 12, 5, 13])
}
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
@ -2572,7 +2572,7 @@ pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 {
simd_shuffle4(a, a, [0, 1, 2, 3])
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
/// Casts vector of type __m256d to type __m128d.
@ -2584,7 +2584,7 @@ pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 {
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
simd_shuffle2(a, a, [0, 1])
simd_shuffle2!(a, a, [0, 1])
}
/// Casts vector of type __m256i to type __m128i.
@ -2597,7 +2597,7 @@ pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
let a = a.as_i64x4();
let dst: i64x2 = simd_shuffle2(a, a, [0, 1]);
let dst: i64x2 = simd_shuffle2!(a, a, [0, 1]);
transmute(dst)
}
@ -2611,8 +2611,8 @@ pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
// FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1])
simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0])
// FIXME simd_shuffle8!(a, a, [0, 1, 2, 3, -1, -1, -1, -1])
simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 0, 0, 0])
}
/// Casts vector of type __m128d to type __m256d;
@ -2625,8 +2625,8 @@ pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
// FIXME simd_shuffle4(a, a, [0, 1, -1, -1])
simd_shuffle4(a, a, [0, 1, 0, 0])
// FIXME simd_shuffle4!(a, a, [0, 1, -1, -1])
simd_shuffle4!(a, a, [0, 1, 0, 0])
}
/// Casts vector of type __m128i to type __m256i;
@ -2640,8 +2640,8 @@ pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
let a = a.as_i64x2();
// FIXME simd_shuffle4(a, a, [0, 1, -1, -1])
let dst: i64x4 = simd_shuffle4(a, a, [0, 1, 0, 0]);
// FIXME simd_shuffle4!(a, a, [0, 1, -1, -1])
let dst: i64x4 = simd_shuffle4!(a, a, [0, 1, 0, 0]);
transmute(dst)
}
@ -2656,7 +2656,7 @@ pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
simd_shuffle8(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Constructs a 256-bit integer vector from a 128-bit integer vector.
@ -2671,7 +2671,7 @@ pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
let b = _mm_setzero_si128().as_i64x2();
let dst: i64x4 = simd_shuffle4(a.as_i64x2(), b, [0, 1, 2, 3]);
let dst: i64x4 = simd_shuffle4!(a.as_i64x2(), b, [0, 1, 2, 3]);
transmute(dst)
}
@ -2687,7 +2687,7 @@ pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
    // Zero-extend: lanes 0..1 come from `a`, lanes 2..3 from the zero vector.
    simd_shuffle4!(a, _mm_setzero_pd(), [0, 1, 2, 3])
}
/// Returns vector of type `__m256` with undefined elements.
@ -2732,7 +2732,7 @@ pub unsafe fn _mm256_undefined_si256() -> __m256i {
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
    // Concatenate: low 128 bits from `lo`, high 128 bits from `hi`.
    simd_shuffle8!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Sets packed __m256d returned vector with the supplied values.

View file

@ -175,7 +175,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
let b = b.as_i8x32();
let r: i8x32 = match IMM8 % 16 {
0 => simd_shuffle32(
0 => simd_shuffle32!(
b,
a,
[
@ -183,7 +183,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
23, 24, 25, 26, 27, 28, 29, 30, 31,
],
),
1 => simd_shuffle32(
1 => simd_shuffle32!(
b,
a,
[
@ -191,7 +191,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
24, 25, 26, 27, 28, 29, 30, 31, 48,
],
),
2 => simd_shuffle32(
2 => simd_shuffle32!(
b,
a,
[
@ -199,7 +199,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
25, 26, 27, 28, 29, 30, 31, 48, 49,
],
),
3 => simd_shuffle32(
3 => simd_shuffle32!(
b,
a,
[
@ -207,7 +207,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
],
),
4 => simd_shuffle32(
4 => simd_shuffle32!(
b,
a,
[
@ -215,7 +215,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
],
),
5 => simd_shuffle32(
5 => simd_shuffle32!(
b,
a,
[
@ -223,7 +223,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
],
),
6 => simd_shuffle32(
6 => simd_shuffle32!(
b,
a,
[
@ -231,7 +231,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
],
),
7 => simd_shuffle32(
7 => simd_shuffle32!(
b,
a,
[
@ -239,7 +239,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
],
),
8 => simd_shuffle32(
8 => simd_shuffle32!(
b,
a,
[
@ -247,7 +247,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
],
),
9 => simd_shuffle32(
9 => simd_shuffle32!(
b,
a,
[
@ -255,7 +255,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
],
),
10 => simd_shuffle32(
10 => simd_shuffle32!(
b,
a,
[
@ -263,7 +263,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
],
),
11 => simd_shuffle32(
11 => simd_shuffle32!(
b,
a,
[
@ -271,7 +271,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
],
),
12 => simd_shuffle32(
12 => simd_shuffle32!(
b,
a,
[
@ -279,7 +279,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
],
),
13 => simd_shuffle32(
13 => simd_shuffle32!(
b,
a,
[
@ -287,7 +287,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
],
),
14 => simd_shuffle32(
14 => simd_shuffle32!(
b,
a,
[
@ -295,7 +295,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
],
),
15 => simd_shuffle32(
15 => simd_shuffle32!(
b,
a,
[
@ -370,10 +370,10 @@ pub unsafe fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128
static_assert_imm4!(IMM4);
let a = a.as_i32x4();
let b = b.as_i32x4();
let r: i32x4 = simd_shuffle4(
let r: i32x4 = simd_shuffle4!(
a,
b,
[
<const IMM4: i32> [
[0, 4, 0, 4][IMM4 as usize & 0b11],
[1, 1, 5, 5][IMM4 as usize & 0b11],
[2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
@ -395,10 +395,10 @@ pub unsafe fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
static_assert_imm8!(IMM8);
let a = a.as_i32x8();
let b = b.as_i32x8();
let r: i32x8 = simd_shuffle8(
let r: i32x8 = simd_shuffle8!(
a,
b,
[
<const IMM8: i32> [
[0, 8, 0, 8][IMM8 as usize & 0b11],
[1, 1, 9, 9][IMM8 as usize & 0b11],
[2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
@ -424,10 +424,11 @@ pub unsafe fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
static_assert_imm8!(IMM8);
let a = a.as_i16x16();
let b = b.as_i16x16();
let r: i16x16 = simd_shuffle16(
let r: i16x16 = simd_shuffle16!(
a,
b,
[
<const IMM8: i32> [
[0, 16, 0, 16][IMM8 as usize & 0b11],
[1, 1, 17, 17][IMM8 as usize & 0b11],
[2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
@ -470,7 +471,7 @@ pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m25
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
    let zero = _mm_setzero_si128();
    // All indices 0: replicate the lowest byte of `a` into every lane.
    let ret = simd_shuffle16!(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
    transmute::<i8x16, _>(ret)
}
@ -484,7 +485,7 @@ pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
    let zero = _mm_setzero_si128();
    // All indices 0: replicate the lowest byte of `a` into all 32 lanes.
    let ret = simd_shuffle32!(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
    transmute::<i8x32, _>(ret)
}
@ -500,7 +501,7 @@ pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
    let zero = _mm_setzero_si128();
    // Replicate the lowest 32-bit element of `a` into every lane.
    let ret = simd_shuffle4!(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
    transmute::<i32x4, _>(ret)
}
@ -516,7 +517,7 @@ pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
    let zero = _mm_setzero_si128();
    // Replicate the lowest 32-bit element of `a` into all 8 lanes.
    let ret = simd_shuffle8!(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
    transmute::<i32x8, _>(ret)
}
@ -530,7 +531,7 @@ pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
    // Replicate the lowest 64-bit element of `a` into both lanes.
    let ret = simd_shuffle2!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
    transmute::<i64x2, _>(ret)
}
@ -543,7 +544,7 @@ pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
    // Replicate the lowest 64-bit element of `a` into all 4 lanes.
    let ret = simd_shuffle4!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
    transmute::<i64x4, _>(ret)
}
@ -556,7 +557,7 @@ pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
    // Replicate the low double of `a`; the second operand is never selected.
    simd_shuffle2!(a, _mm_setzero_pd(), [0_u32; 2])
}
/// Broadcasts the low double-precision (64-bit) floating-point element
@ -568,7 +569,7 @@ pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
    // Replicate the low double of `a` into all 4 lanes of the 256-bit result.
    simd_shuffle4!(a, _mm_setzero_pd(), [0_u32; 4])
}
// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
@ -582,7 +583,7 @@ pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
    let zero = _mm_setzero_si128();
    // Repeat both 64-bit lanes of `a` to fill the 256-bit result.
    let ret = simd_shuffle4!(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
    transmute::<i64x4, _>(ret)
}
@ -595,7 +596,7 @@ pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
    // Replicate the low single of `a`; the zero operand is never selected.
    simd_shuffle4!(a, _mm_setzero_ps(), [0_u32; 4])
}
/// Broadcasts the low single-precision (32-bit) floating-point element
@ -607,7 +608,7 @@ pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
    // Replicate the low single of `a` into all 8 lanes of the 256-bit result.
    simd_shuffle8!(a, _mm_setzero_ps(), [0_u32; 8])
}
/// Broadcasts the low packed 16-bit integer from a to all elements of
@ -620,7 +621,7 @@ pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
    let zero = _mm_setzero_si128();
    // Replicate the lowest 16-bit element of `a` into every lane.
    let ret = simd_shuffle8!(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
    transmute::<i16x8, _>(ret)
}
@ -634,7 +635,7 @@ pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
    let zero = _mm_setzero_si128();
    // Replicate the lowest 16-bit element of `a` into all 16 lanes.
    let ret = simd_shuffle16!(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
    transmute::<i16x16, _>(ret)
}
@ -746,7 +747,7 @@ pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
    let a = a.as_i16x8();
    // Take the 4 low 16-bit elements, then sign-extend each to 64 bits.
    let v64: i16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute::<i64x4, _>(simd_cast(v64))
}
@ -781,7 +782,7 @@ pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
    let a = a.as_i8x16();
    // Take the 8 low bytes, then sign-extend each to 32 bits.
    let v64: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute::<i32x8, _>(simd_cast(v64))
}
@ -794,7 +795,7 @@ pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
    let a = a.as_i8x16();
    // Take the 4 low bytes, then sign-extend each to 64 bits.
    let v32: i8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute::<i64x4, _>(simd_cast(v32))
}
@ -820,7 +821,7 @@ pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
    let a = a.as_u16x8();
    // Take the 4 low 16-bit elements, then zero-extend each to 64 bits
    // (zero-extension because the source lanes are unsigned).
    let v64: u16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute::<i64x4, _>(simd_cast(v64))
}
@ -856,7 +857,7 @@ pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
    let a = a.as_u8x16();
    // Take the 8 low bytes, then zero-extend each to 32 bits.
    let v64: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute::<i32x8, _>(simd_cast(v64))
}
@ -870,7 +871,7 @@ pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
    let a = a.as_u8x16();
    // Take the 4 low bytes, then zero-extend each to 64 bits.
    let v32: u8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute::<i64x4, _>(simd_cast(v32))
}
@ -889,7 +890,7 @@ pub unsafe fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
static_assert_imm1!(IMM1);
let a = a.as_i64x4();
let b = _mm256_undefined_si256().as_i64x4();
let dst: i64x2 = simd_shuffle2(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
let dst: i64x2 = simd_shuffle2!(a, b, <const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize]);
transmute(dst)
}
@ -1711,7 +1712,8 @@ pub unsafe fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -
static_assert_imm1!(IMM1);
let a = a.as_i64x4();
let b = _mm256_castsi128_si256(b).as_i64x4();
let dst: i64x4 = simd_shuffle4(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
let dst: i64x4 =
simd_shuffle4!(a, b, <const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
transmute(dst)
}
@ -2200,10 +2202,10 @@ pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
pub unsafe fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(IMM8);
let zero = _mm256_setzero_si256().as_i64x4();
let r: i64x4 = simd_shuffle4(
let r: i64x4 = simd_shuffle4!(
a.as_i64x4(),
zero,
[
<const IMM8: i32> [
IMM8 as u32 & 0b11,
(IMM8 as u32 >> 2) & 0b11,
(IMM8 as u32 >> 4) & 0b11,
@ -2237,10 +2239,10 @@ pub unsafe fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
static_assert_imm8!(IMM8);
simd_shuffle4(
simd_shuffle4!(
a,
_mm256_undefined_pd(),
[
<const IMM8: i32> [
IMM8 as u32 & 0b11,
(IMM8 as u32 >> 2) & 0b11,
(IMM8 as u32 >> 4) & 0b11,
@ -2350,10 +2352,10 @@ pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(MASK);
let r: i32x8 = simd_shuffle8(
let r: i32x8 = simd_shuffle8!(
a.as_i32x8(),
a.as_i32x8(),
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
(MASK as u32 >> 4) & 0b11,
@ -2380,10 +2382,10 @@ pub unsafe fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
pub unsafe fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(IMM8);
let a = a.as_i16x16();
let r: i16x16 = simd_shuffle16(
let r: i16x16 = simd_shuffle16!(
a,
a,
[
<const IMM8: i32> [
0,
1,
2,
@ -2418,10 +2420,10 @@ pub unsafe fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
pub unsafe fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(IMM8);
let a = a.as_i16x16();
let r: i16x16 = simd_shuffle16(
let r: i16x16 = simd_shuffle16!(
a,
a,
[
<const IMM8: i32> [
0 + (IMM8 as u32 & 0b11),
0 + ((IMM8 as u32 >> 2) & 0b11),
0 + ((IMM8 as u32 >> 4) & 0b11),
@ -2585,10 +2587,10 @@ pub unsafe fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(IMM8);
let a = a.as_i8x32();
let zero = _mm256_setzero_si256().as_i8x32();
let r: i8x32 = simd_shuffle32(
let r: i8x32 = simd_shuffle32!(
zero,
a,
[
<const IMM8: i32> [
32 - (IMM8 as u32 & 0xff),
33 - (IMM8 as u32 & 0xff),
34 - (IMM8 as u32 & 0xff),
@ -2780,7 +2782,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
let a = a.as_i8x32();
let zero = _mm256_setzero_si256().as_i8x32();
let r: i8x32 = match IMM8 % 16 {
0 => simd_shuffle32(
0 => simd_shuffle32!(
a,
zero,
[
@ -2788,7 +2790,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
23, 24, 25, 26, 27, 28, 29, 30, 31,
],
),
1 => simd_shuffle32(
1 => simd_shuffle32!(
a,
zero,
[
@ -2796,7 +2798,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
24, 25, 26, 27, 28, 29, 30, 31, 32,
],
),
2 => simd_shuffle32(
2 => simd_shuffle32!(
a,
zero,
[
@ -2804,7 +2806,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
25, 26, 27, 28, 29, 30, 31, 32, 32,
],
),
3 => simd_shuffle32(
3 => simd_shuffle32!(
a,
zero,
[
@ -2812,7 +2814,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
25, 26, 27, 28, 29, 30, 31, 32, 32, 32,
],
),
4 => simd_shuffle32(
4 => simd_shuffle32!(
a,
zero,
[
@ -2820,7 +2822,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
26, 27, 28, 29, 30, 31, 32, 32, 32, 32,
],
),
5 => simd_shuffle32(
5 => simd_shuffle32!(
a,
zero,
[
@ -2828,7 +2830,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
27, 28, 29, 30, 31, 32, 32, 32, 32, 32,
],
),
6 => simd_shuffle32(
6 => simd_shuffle32!(
a,
zero,
[
@ -2836,7 +2838,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
28, 29, 30, 31, 32, 32, 32, 32, 32, 32,
],
),
7 => simd_shuffle32(
7 => simd_shuffle32!(
a,
zero,
[
@ -2844,7 +2846,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32,
],
),
8 => simd_shuffle32(
8 => simd_shuffle32!(
a,
zero,
[
@ -2852,7 +2854,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32,
],
),
9 => simd_shuffle32(
9 => simd_shuffle32!(
a,
zero,
[
@ -2860,7 +2862,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
],
),
10 => simd_shuffle32(
10 => simd_shuffle32!(
a,
zero,
[
@ -2868,7 +2870,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
],
),
11 => simd_shuffle32(
11 => simd_shuffle32!(
a,
zero,
[
@ -2876,7 +2878,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
],
),
12 => simd_shuffle32(
12 => simd_shuffle32!(
a,
zero,
[
@ -2884,7 +2886,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
],
),
13 => simd_shuffle32(
13 => simd_shuffle32!(
a,
zero,
[
@ -2892,7 +2894,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
],
),
14 => simd_shuffle32(
14 => simd_shuffle32!(
a,
zero,
[
@ -2900,7 +2902,7 @@ pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
],
),
15 => simd_shuffle32(
15 => simd_shuffle32!(
a,
zero,
[
@ -3178,7 +3180,7 @@ pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
#[rustfmt::skip]
let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
let r: i8x32 = simd_shuffle32!(a.as_i8x32(), b.as_i8x32(), [
8, 40, 9, 41, 10, 42, 11, 43,
12, 44, 13, 45, 14, 46, 15, 47,
24, 56, 25, 57, 26, 58, 27, 59,
@ -3231,7 +3233,7 @@ pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
#[rustfmt::skip]
let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
let r: i8x32 = simd_shuffle32!(a.as_i8x32(), b.as_i8x32(), [
0, 32, 1, 33, 2, 34, 3, 35,
4, 36, 5, 37, 6, 38, 7, 39,
16, 48, 17, 49, 18, 50, 19, 51,
@ -3279,7 +3281,7 @@ pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpunpckhwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
let r: i16x16 = simd_shuffle16(
let r: i16x16 = simd_shuffle16!(
a.as_i16x16(),
b.as_i16x16(),
[4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
@ -3327,7 +3329,7 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpunpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
let r: i16x16 = simd_shuffle16(
let r: i16x16 = simd_shuffle16!(
a.as_i16x16(),
b.as_i16x16(),
[0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
@ -3368,7 +3370,7 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vunpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Interleave the high 32-bit elements of each 128-bit half of `a` and `b`
    // (indices >= 8 select from `b`).
    let r: i32x8 = simd_shuffle8!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
    transmute(r)
}
@ -3405,7 +3407,7 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vunpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Interleave the low 32-bit elements of each 128-bit half of `a` and `b`
    // (indices >= 8 select from `b`).
    let r: i32x8 = simd_shuffle8!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
    transmute(r)
}
@ -3442,7 +3444,7 @@ pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vunpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
    // Interleave the high 64-bit element of each 128-bit half of `a` and `b`
    // (indices >= 4 select from `b`).
    let r: i64x4 = simd_shuffle4!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
    transmute(r)
}
@ -3479,7 +3481,7 @@ pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vunpcklpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
    // Interleave the low 64-bit element of each 128-bit half of `a` and `b`
    // (indices >= 4 select from `b`).
    let r: i64x4 = simd_shuffle4!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
    transmute(r)
}

View file

@ -6218,7 +6218,7 @@ pub unsafe fn _mm_mask_blend_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m12
#[cfg_attr(test, assert_instr(vpbroadcastw))]
pub unsafe fn _mm512_broadcastw_epi16(a: __m128i) -> __m512i {
let a = _mm512_castsi128_si512(a).as_i16x32();
let ret: i16x32 = simd_shuffle32(
let ret: i16x32 = simd_shuffle32!(
a,
a,
[
@ -6306,7 +6306,7 @@ pub unsafe fn _mm_maskz_broadcastw_epi16(k: __mmask8, a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vpbroadcastb))]
pub unsafe fn _mm512_broadcastb_epi8(a: __m128i) -> __m512i {
let a = _mm512_castsi128_si512(a).as_i8x64();
let ret: i8x64 = simd_shuffle64(
let ret: i8x64 = simd_shuffle64!(
a,
a,
[
@ -6397,7 +6397,7 @@ pub unsafe fn _mm512_unpackhi_epi16(a: __m512i, b: __m512i) -> __m512i {
let a = a.as_i16x32();
let b = b.as_i16x32();
#[rustfmt::skip]
let r: i16x32 = simd_shuffle32(
let r: i16x32 = simd_shuffle32!(
a,
b,
[
@ -6508,7 +6508,7 @@ pub unsafe fn _mm512_unpackhi_epi8(a: __m512i, b: __m512i) -> __m512i {
let a = a.as_i8x64();
let b = b.as_i8x64();
#[rustfmt::skip]
let r: i8x64 = simd_shuffle64(
let r: i8x64 = simd_shuffle64!(
a,
b,
[
@ -6627,7 +6627,7 @@ pub unsafe fn _mm512_unpacklo_epi16(a: __m512i, b: __m512i) -> __m512i {
let a = a.as_i16x32();
let b = b.as_i16x32();
#[rustfmt::skip]
let r: i16x32 = simd_shuffle32(
let r: i16x32 = simd_shuffle32!(
a,
b,
[
@ -6738,7 +6738,7 @@ pub unsafe fn _mm512_unpacklo_epi8(a: __m512i, b: __m512i) -> __m512i {
let a = a.as_i8x64();
let b = b.as_i8x64();
#[rustfmt::skip]
let r: i8x64 = simd_shuffle64(
let r: i8x64 = simd_shuffle64!(
a,
b,
[
@ -7133,10 +7133,10 @@ pub unsafe fn _mm_maskz_set1_epi8(k: __mmask16, a: i8) -> __m128i {
pub unsafe fn _mm512_shufflelo_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
static_assert_imm8!(IMM8);
let a = a.as_i16x32();
let r: i16x32 = simd_shuffle32(
let r: i16x32 = simd_shuffle32!(
a,
a,
[
<const IMM8: i32> [
IMM8 as u32 & 0b11,
(IMM8 as u32 >> 2) & 0b11,
(IMM8 as u32 >> 4) & 0b11,
@ -7277,10 +7277,10 @@ pub unsafe fn _mm_maskz_shufflelo_epi16<const IMM8: i32>(k: __mmask8, a: __m128i
pub unsafe fn _mm512_shufflehi_epi16<const IMM8: i32>(a: __m512i) -> __m512i {
static_assert_imm8!(IMM8);
let a = a.as_i16x32();
let r: i16x32 = simd_shuffle32(
let r: i16x32 = simd_shuffle32!(
a,
a,
[
<const IMM8: i32> [
0,
1,
2,
@ -8433,7 +8433,7 @@ pub unsafe fn _mm256_maskz_cvtepi16_epi8(k: __mmask16, a: __m256i) -> __m128i {
pub unsafe fn _mm_cvtepi16_epi8(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let zero = _mm_setzero_si128().as_i16x8();
    // Widen to 16 lanes (upper 8 lanes are zeros from `zero`), then truncate
    // each 16-bit element to 8 bits via simd_cast.
    let v256: i16x16 = simd_shuffle16!(a, zero, [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]);
    transmute::<i8x16, _>(simd_cast(v256))
}
@ -8875,10 +8875,10 @@ pub unsafe fn _mm512_bslli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
static_assert_imm8!(IMM8);
let a = a.as_i8x64();
let zero = _mm512_setzero_si512().as_i8x64();
let r: i8x64 = simd_shuffle64(
let r: i8x64 = simd_shuffle64!(
zero,
a,
[
<const IMM8: i32> [
64 - (IMM8 as u32 & 0xff),
65 - (IMM8 as u32 & 0xff),
66 - (IMM8 as u32 & 0xff),
@ -8960,7 +8960,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
let a = a.as_i8x64();
let zero = _mm512_setzero_si512().as_i8x64();
let r: i8x64 = match IMM8 % 16 {
0 => simd_shuffle64(
0 => simd_shuffle64!(
a,
zero,
[
@ -8969,7 +8969,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
],
),
1 => simd_shuffle64(
1 => simd_shuffle64!(
a,
zero,
[
@ -8978,7 +8978,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112,
],
),
2 => simd_shuffle64(
2 => simd_shuffle64!(
a,
zero,
[
@ -8987,7 +8987,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113,
],
),
3 => simd_shuffle64(
3 => simd_shuffle64!(
a,
zero,
[
@ -8997,7 +8997,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
114,
],
),
4 => simd_shuffle64(
4 => simd_shuffle64!(
a,
zero,
[
@ -9007,7 +9007,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
115,
],
),
5 => simd_shuffle64(
5 => simd_shuffle64!(
a,
zero,
[
@ -9017,7 +9017,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
115, 116,
],
),
6 => simd_shuffle64(
6 => simd_shuffle64!(
a,
zero,
[
@ -9027,7 +9027,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
116, 117,
],
),
7 => simd_shuffle64(
7 => simd_shuffle64!(
a,
zero,
[
@ -9037,7 +9037,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
116, 117, 118,
],
),
8 => simd_shuffle64(
8 => simd_shuffle64!(
a,
zero,
[
@ -9047,7 +9047,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
116, 117, 118, 119,
],
),
9 => simd_shuffle64(
9 => simd_shuffle64!(
a,
zero,
[
@ -9057,7 +9057,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
117, 118, 119, 120,
],
),
10 => simd_shuffle64(
10 => simd_shuffle64!(
a,
zero,
[
@ -9067,7 +9067,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
118, 119, 120, 121,
],
),
11 => simd_shuffle64(
11 => simd_shuffle64!(
a,
zero,
[
@ -9077,7 +9077,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
117, 118, 119, 120, 121, 122,
],
),
12 => simd_shuffle64(
12 => simd_shuffle64!(
a,
zero,
[
@ -9087,7 +9087,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
118, 119, 120, 121, 122, 123,
],
),
13 => simd_shuffle64(
13 => simd_shuffle64!(
a,
zero,
[
@ -9097,7 +9097,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
119, 120, 121, 122, 123, 124,
],
),
14 => simd_shuffle64(
14 => simd_shuffle64!(
a,
zero,
[
@ -9107,7 +9107,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
120, 121, 122, 123, 124, 125,
],
),
15 => simd_shuffle64(
15 => simd_shuffle64!(
a,
zero,
[
@ -9146,7 +9146,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
let b = b.as_i8x64();
let r: i8x64 = match IMM8 % 16 {
0 => simd_shuffle64(
0 => simd_shuffle64!(
b,
a,
[
@ -9155,7 +9155,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
],
),
1 => simd_shuffle64(
1 => simd_shuffle64!(
b,
a,
[
@ -9164,7 +9164,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112,
],
),
2 => simd_shuffle64(
2 => simd_shuffle64!(
b,
a,
[
@ -9173,7 +9173,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113,
],
),
3 => simd_shuffle64(
3 => simd_shuffle64!(
b,
a,
[
@ -9183,7 +9183,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
114,
],
),
4 => simd_shuffle64(
4 => simd_shuffle64!(
b,
a,
[
@ -9193,7 +9193,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
115,
],
),
5 => simd_shuffle64(
5 => simd_shuffle64!(
b,
a,
[
@ -9203,7 +9203,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
115, 116,
],
),
6 => simd_shuffle64(
6 => simd_shuffle64!(
b,
a,
[
@ -9213,7 +9213,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
116, 117,
],
),
7 => simd_shuffle64(
7 => simd_shuffle64!(
b,
a,
[
@ -9223,7 +9223,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
116, 117, 118,
],
),
8 => simd_shuffle64(
8 => simd_shuffle64!(
b,
a,
[
@ -9233,7 +9233,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
116, 117, 118, 119,
],
),
9 => simd_shuffle64(
9 => simd_shuffle64!(
b,
a,
[
@ -9243,7 +9243,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
117, 118, 119, 120,
],
),
10 => simd_shuffle64(
10 => simd_shuffle64!(
b,
a,
[
@ -9253,7 +9253,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
118, 119, 120, 121,
],
),
11 => simd_shuffle64(
11 => simd_shuffle64!(
b,
a,
[
@ -9263,7 +9263,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
117, 118, 119, 120, 121, 122,
],
),
12 => simd_shuffle64(
12 => simd_shuffle64!(
b,
a,
[
@ -9273,7 +9273,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
118, 119, 120, 121, 122, 123,
],
),
13 => simd_shuffle64(
13 => simd_shuffle64!(
b,
a,
[
@ -9283,7 +9283,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
119, 120, 121, 122, 123, 124,
],
),
14 => simd_shuffle64(
14 => simd_shuffle64!(
b,
a,
[
@ -9293,7 +9293,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
120, 121, 122, 123, 124, 125,
],
),
15 => simd_shuffle64(
15 => simd_shuffle64!(
b,
a,
[

View file

@ -10529,7 +10529,7 @@ pub unsafe fn _mm512_cvtpd_pslo(v2: __m512d) -> __m512 {
0b11111111,
_MM_FROUND_CUR_DIRECTION,
);
simd_shuffle16(
simd_shuffle16!(
r,
_mm256_setzero_ps().as_f32x8(),
[0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
@ -10549,7 +10549,7 @@ pub unsafe fn _mm512_mask_cvtpd_pslo(src: __m512, k: __mmask8, v2: __m512d) -> _
k,
_MM_FROUND_CUR_DIRECTION,
);
simd_shuffle16(
simd_shuffle16!(
r,
_mm256_setzero_ps().as_f32x8(),
[0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
@ -10644,7 +10644,7 @@ pub unsafe fn _mm_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vpmovsxbq))]
pub unsafe fn _mm512_cvtepi8_epi64(a: __m128i) -> __m512i {
let a = a.as_i8x16();
let v64: i8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
let v64: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
transmute::<i64x8, _>(simd_cast(v64))
}
@ -10805,7 +10805,7 @@ pub unsafe fn _mm_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(vpmovzxbq))]
pub unsafe fn _mm512_cvtepu8_epi64(a: __m128i) -> __m512i {
let a = a.as_u8x16();
let v64: u8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
let v64: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
transmute::<i64x8, _>(simd_cast(v64))
}
@ -11628,7 +11628,7 @@ pub unsafe fn _mm256_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m256d {
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm_cvtepu32_pd(a: __m128i) -> __m128d {
let a = a.as_u32x4();
let u64: u32x2 = simd_shuffle2(a, a, [0, 1]);
let u64: u32x2 = simd_shuffle2!(a, a, [0, 1]);
transmute::<f64x2, _>(simd_cast(u64))
}
@ -11663,7 +11663,7 @@ pub unsafe fn _mm_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m128d {
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
pub unsafe fn _mm512_cvtepi32lo_pd(v2: __m512i) -> __m512d {
let v2 = v2.as_i32x16();
let v256: i32x8 = simd_shuffle8(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
let v256: i32x8 = simd_shuffle8!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
transmute::<f64x8, _>(simd_cast(v256))
}
@ -11686,7 +11686,7 @@ pub unsafe fn _mm512_mask_cvtepi32lo_pd(src: __m512d, k: __mmask8, v2: __m512i)
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm512_cvtepu32lo_pd(v2: __m512i) -> __m512d {
let v2 = v2.as_u32x16();
let v256: u32x8 = simd_shuffle8(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
let v256: u32x8 = simd_shuffle8!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
transmute::<f64x8, _>(simd_cast(v256))
}
@ -19215,10 +19215,10 @@ pub unsafe fn _mm_maskz_srlv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> _
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_permute_ps<const MASK: i32>(a: __m512) -> __m512 {
static_assert_imm8!(MASK);
simd_shuffle16(
simd_shuffle16!(
a,
a,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11),
@ -19333,10 +19333,10 @@ pub unsafe fn _mm_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m128) -> _
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_permute_pd<const MASK: i32>(a: __m512d) -> __m512d {
static_assert_imm8!(MASK);
simd_shuffle8(
simd_shuffle8!(
a,
a,
[
<const MASK: i32> [
MASK as u32 & 0b1,
((MASK as u32 >> 1) & 0b1),
((MASK as u32 >> 2) & 0b1) + 2,
@ -19451,10 +19451,10 @@ pub unsafe fn _mm_maskz_permute_pd<const IMM2: i32>(k: __mmask8, a: __m128d) ->
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_permutex_epi64<const MASK: i32>(a: __m512i) -> __m512i {
static_assert_imm8!(MASK);
simd_shuffle8(
simd_shuffle8!(
a,
a,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11),
@ -19507,10 +19507,10 @@ pub unsafe fn _mm512_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m51
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_permutex_epi64<const MASK: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(MASK);
simd_shuffle4(
simd_shuffle4!(
a,
a,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11),
@ -19559,10 +19559,10 @@ pub unsafe fn _mm256_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m25
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_permutex_pd<const MASK: i32>(a: __m512d) -> __m512d {
static_assert_imm8!(MASK);
simd_shuffle8(
simd_shuffle8!(
a,
a,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11),
@ -19613,10 +19613,10 @@ pub unsafe fn _mm512_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m512d)
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_permutex_pd<const MASK: i32>(a: __m256d) -> __m256d {
static_assert_imm8!(MASK);
simd_shuffle4(
simd_shuffle4!(
a,
a,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11),
@ -20867,10 +20867,10 @@ pub unsafe fn _mm_mask2_permutex2var_pd(
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_shuffle_epi32<const MASK: _MM_PERM_ENUM>(a: __m512i) -> __m512i {
static_assert_imm8!(MASK);
let r: i32x16 = simd_shuffle16(
let r: i32x16 = simd_shuffle16!(
a.as_i32x16(),
a.as_i32x16(),
[
<const MASK: _MM_PERM_ENUM> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
(MASK as u32 >> 4) & 0b11,
@ -21003,10 +21003,10 @@ pub unsafe fn _mm_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_shuffle_ps<const MASK: i32>(a: __m512, b: __m512) -> __m512 {
static_assert_imm8!(MASK);
simd_shuffle16(
simd_shuffle16!(
a,
b,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11) + 16,
@ -21140,10 +21140,10 @@ pub unsafe fn _mm_maskz_shuffle_ps<const MASK: i32>(k: __mmask8, a: __m128, b: _
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_shuffle_pd<const MASK: i32>(a: __m512d, b: __m512d) -> __m512d {
static_assert_imm8!(MASK);
simd_shuffle8(
simd_shuffle8!(
a,
b,
[
<const MASK: i32> [
MASK as u32 & 0b1,
((MASK as u32 >> 1) & 0b1) + 8,
((MASK as u32 >> 2) & 0b1) + 2,
@ -21275,10 +21275,10 @@ pub unsafe fn _mm512_shuffle_i32x4<const MASK: i32>(a: __m512i, b: __m512i) -> _
static_assert_imm8!(MASK);
let a = a.as_i32x16();
let b = b.as_i32x16();
let r: i32x16 = simd_shuffle16(
let r: i32x16 = simd_shuffle16!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b11) * 4 + 0,
(MASK as u32 & 0b11) * 4 + 1,
(MASK as u32 & 0b11) * 4 + 2,
@ -21347,10 +21347,10 @@ pub unsafe fn _mm256_shuffle_i32x4<const MASK: i32>(a: __m256i, b: __m256i) -> _
static_assert_imm8!(MASK);
let a = a.as_i32x8();
let b = b.as_i32x8();
let r: i32x8 = simd_shuffle8(
let r: i32x8 = simd_shuffle8!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b1) * 4 + 0,
(MASK as u32 & 0b1) * 4 + 1,
(MASK as u32 & 0b1) * 4 + 2,
@ -21411,10 +21411,10 @@ pub unsafe fn _mm512_shuffle_i64x2<const MASK: i32>(a: __m512i, b: __m512i) -> _
static_assert_imm8!(MASK);
let a = a.as_i64x8();
let b = b.as_i64x8();
let r: i64x8 = simd_shuffle8(
let r: i64x8 = simd_shuffle8!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b11) * 2 + 0,
(MASK as u32 & 0b11) * 2 + 1,
((MASK as u32 >> 2) & 0b11) * 2 + 0,
@ -21475,10 +21475,10 @@ pub unsafe fn _mm256_shuffle_i64x2<const MASK: i32>(a: __m256i, b: __m256i) -> _
static_assert_imm8!(MASK);
let a = a.as_i64x4();
let b = b.as_i64x4();
let r: i64x4 = simd_shuffle4(
let r: i64x4 = simd_shuffle4!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b1) * 2 + 0,
(MASK as u32 & 0b1) * 2 + 1,
((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4,
@ -21535,10 +21535,10 @@ pub unsafe fn _mm512_shuffle_f32x4<const MASK: i32>(a: __m512, b: __m512) -> __m
static_assert_imm8!(MASK);
let a = a.as_f32x16();
let b = b.as_f32x16();
let r: f32x16 = simd_shuffle16(
let r: f32x16 = simd_shuffle16!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b11) * 4 + 0,
(MASK as u32 & 0b11) * 4 + 1,
(MASK as u32 & 0b11) * 4 + 2,
@ -21607,10 +21607,10 @@ pub unsafe fn _mm256_shuffle_f32x4<const MASK: i32>(a: __m256, b: __m256) -> __m
static_assert_imm8!(MASK);
let a = a.as_f32x8();
let b = b.as_f32x8();
let r: f32x8 = simd_shuffle8(
let r: f32x8 = simd_shuffle8!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b1) * 4 + 0,
(MASK as u32 & 0b1) * 4 + 1,
(MASK as u32 & 0b1) * 4 + 2,
@ -21671,10 +21671,10 @@ pub unsafe fn _mm512_shuffle_f64x2<const MASK: i32>(a: __m512d, b: __m512d) -> _
static_assert_imm8!(MASK);
let a = a.as_f64x8();
let b = b.as_f64x8();
let r: f64x8 = simd_shuffle8(
let r: f64x8 = simd_shuffle8!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b11) * 2 + 0,
(MASK as u32 & 0b11) * 2 + 1,
((MASK as u32 >> 2) & 0b11) * 2 + 0,
@ -21735,10 +21735,10 @@ pub unsafe fn _mm256_shuffle_f64x2<const MASK: i32>(a: __m256d, b: __m256d) -> _
static_assert_imm8!(MASK);
let a = a.as_f64x4();
let b = b.as_f64x4();
let r: f64x4 = simd_shuffle4(
let r: f64x4 = simd_shuffle4!(
a,
b,
[
<const MASK: i32> [
(MASK as u32 & 0b1) * 2 + 0,
(MASK as u32 & 0b1) * 2 + 1,
((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4,
@ -21797,10 +21797,10 @@ pub unsafe fn _mm256_maskz_shuffle_f64x2<const MASK: i32>(
pub unsafe fn _mm512_extractf32x4_ps<const IMM8: i32>(a: __m512) -> __m128 {
static_assert_imm2!(IMM8);
match IMM8 & 0x3 {
0 => simd_shuffle4(a, _mm512_undefined_ps(), [0, 1, 2, 3]),
1 => simd_shuffle4(a, _mm512_undefined_ps(), [4, 5, 6, 7]),
2 => simd_shuffle4(a, _mm512_undefined_ps(), [8, 9, 10, 11]),
_ => simd_shuffle4(a, _mm512_undefined_ps(), [12, 13, 14, 15]),
0 => simd_shuffle4!(a, _mm512_undefined_ps(), [0, 1, 2, 3]),
1 => simd_shuffle4!(a, _mm512_undefined_ps(), [4, 5, 6, 7]),
2 => simd_shuffle4!(a, _mm512_undefined_ps(), [8, 9, 10, 11]),
_ => simd_shuffle4!(a, _mm512_undefined_ps(), [12, 13, 14, 15]),
}
}
@ -21854,8 +21854,8 @@ pub unsafe fn _mm512_maskz_extractf32x4_ps<const IMM8: i32>(k: __mmask8, a: __m5
pub unsafe fn _mm256_extractf32x4_ps<const IMM8: i32>(a: __m256) -> __m128 {
static_assert_imm1!(IMM8);
match IMM8 & 0x1 {
0 => simd_shuffle4(a, _mm256_undefined_ps(), [0, 1, 2, 3]),
_ => simd_shuffle4(a, _mm256_undefined_ps(), [4, 5, 6, 7]),
0 => simd_shuffle4!(a, _mm256_undefined_ps(), [0, 1, 2, 3]),
_ => simd_shuffle4!(a, _mm256_undefined_ps(), [4, 5, 6, 7]),
}
}
@ -21909,8 +21909,8 @@ pub unsafe fn _mm256_maskz_extractf32x4_ps<const IMM8: i32>(k: __mmask8, a: __m2
pub unsafe fn _mm512_extracti64x4_epi64<const IMM1: i32>(a: __m512i) -> __m256i {
static_assert_imm1!(IMM1);
match IMM1 {
0 => simd_shuffle4(a, _mm512_set1_epi64(0), [0, 1, 2, 3]),
_ => simd_shuffle4(a, _mm512_set1_epi64(0), [4, 5, 6, 7]),
0 => simd_shuffle4!(a, _mm512_set1_epi64(0), [0, 1, 2, 3]),
_ => simd_shuffle4!(a, _mm512_set1_epi64(0), [4, 5, 6, 7]),
}
}
@ -21964,8 +21964,8 @@ pub unsafe fn _mm512_maskz_extracti64x4_epi64<const IMM1: i32>(k: __mmask8, a: _
pub unsafe fn _mm512_extractf64x4_pd<const IMM8: i32>(a: __m512d) -> __m256d {
static_assert_imm1!(IMM8);
match IMM8 & 0x1 {
0 => simd_shuffle4(a, _mm512_undefined_pd(), [0, 1, 2, 3]),
_ => simd_shuffle4(a, _mm512_undefined_pd(), [4, 5, 6, 7]),
0 => simd_shuffle4!(a, _mm512_undefined_pd(), [0, 1, 2, 3]),
_ => simd_shuffle4!(a, _mm512_undefined_pd(), [4, 5, 6, 7]),
}
}
@ -22021,10 +22021,10 @@ pub unsafe fn _mm512_extracti32x4_epi32<const IMM2: i32>(a: __m512i) -> __m128i
let a = a.as_i32x16();
let undefined = _mm512_undefined_epi32().as_i32x16();
let extract: i32x4 = match IMM2 {
0 => simd_shuffle4(a, undefined, [0, 1, 2, 3]),
1 => simd_shuffle4(a, undefined, [4, 5, 6, 7]),
2 => simd_shuffle4(a, undefined, [8, 9, 10, 11]),
_ => simd_shuffle4(a, undefined, [12, 13, 14, 15]),
0 => simd_shuffle4!(a, undefined, [0, 1, 2, 3]),
1 => simd_shuffle4!(a, undefined, [4, 5, 6, 7]),
2 => simd_shuffle4!(a, undefined, [8, 9, 10, 11]),
_ => simd_shuffle4!(a, undefined, [12, 13, 14, 15]),
};
transmute(extract)
}
@ -22081,8 +22081,8 @@ pub unsafe fn _mm256_extracti32x4_epi32<const IMM1: i32>(a: __m256i) -> __m128i
let a = a.as_i32x8();
let undefined = _mm256_undefined_si256().as_i32x8();
let extract: i32x4 = match IMM1 {
0 => simd_shuffle4(a, undefined, [0, 1, 2, 3]),
_ => simd_shuffle4(a, undefined, [4, 5, 6, 7]),
0 => simd_shuffle4!(a, undefined, [0, 1, 2, 3]),
_ => simd_shuffle4!(a, undefined, [4, 5, 6, 7]),
};
transmute(extract)
}
@ -22131,7 +22131,7 @@ pub unsafe fn _mm256_maskz_extracti32x4_epi32<const IMM1: i32>(k: __mmask8, a: _
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovsldup))]
pub unsafe fn _mm512_moveldup_ps(a: __m512) -> __m512 {
let r: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
let r: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
transmute(r)
}
@ -22142,7 +22142,7 @@ pub unsafe fn _mm512_moveldup_ps(a: __m512) -> __m512 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovsldup))]
pub unsafe fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
let mov: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
let mov: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
transmute(simd_select_bitmask(k, mov, src.as_f32x16()))
}
@ -22153,7 +22153,7 @@ pub unsafe fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> _
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovsldup))]
pub unsafe fn _mm512_maskz_moveldup_ps(k: __mmask16, a: __m512) -> __m512 {
let mov: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
let mov: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
let zero = _mm512_setzero_ps().as_f32x16();
transmute(simd_select_bitmask(k, mov, zero))
}
@ -22211,7 +22211,7 @@ pub unsafe fn _mm_maskz_moveldup_ps(k: __mmask8, a: __m128) -> __m128 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovshdup))]
pub unsafe fn _mm512_movehdup_ps(a: __m512) -> __m512 {
let r: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
let r: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
transmute(r)
}
@ -22222,7 +22222,7 @@ pub unsafe fn _mm512_movehdup_ps(a: __m512) -> __m512 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovshdup))]
pub unsafe fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
let mov: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
let mov: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
transmute(simd_select_bitmask(k, mov, src.as_f32x16()))
}
@ -22233,7 +22233,7 @@ pub unsafe fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> _
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovshdup))]
pub unsafe fn _mm512_maskz_movehdup_ps(k: __mmask16, a: __m512) -> __m512 {
let mov: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
let mov: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
let zero = _mm512_setzero_ps().as_f32x16();
transmute(simd_select_bitmask(k, mov, zero))
}
@ -22291,7 +22291,7 @@ pub unsafe fn _mm_maskz_movehdup_ps(k: __mmask8, a: __m128) -> __m128 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovddup))]
pub unsafe fn _mm512_movedup_pd(a: __m512d) -> __m512d {
let r: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
let r: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
transmute(r)
}
@ -22302,7 +22302,7 @@ pub unsafe fn _mm512_movedup_pd(a: __m512d) -> __m512d {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovddup))]
pub unsafe fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
let mov: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
let mov: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
transmute(simd_select_bitmask(k, mov, src.as_f64x8()))
}
@ -22313,7 +22313,7 @@ pub unsafe fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> _
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovddup))]
pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d {
let mov: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
let mov: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
let zero = _mm512_setzero_pd().as_f64x8();
transmute(simd_select_bitmask(k, mov, zero))
}
@ -22376,22 +22376,22 @@ pub unsafe fn _mm512_inserti32x4<const IMM8: i32>(a: __m512i, b: __m128i) -> __m
let a = a.as_i32x16();
let b = _mm512_castsi128_si512(b).as_i32x16();
let ret: i32x16 = match IMM8 & 0b11 {
0 => simd_shuffle16(
0 => simd_shuffle16!(
a,
b,
[16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
),
1 => simd_shuffle16(
1 => simd_shuffle16!(
a,
b,
[0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
),
2 => simd_shuffle16(
2 => simd_shuffle16!(
a,
b,
[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
),
_ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
_ => simd_shuffle16!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
};
transmute(ret)
}
@ -22447,8 +22447,8 @@ pub unsafe fn _mm256_inserti32x4<const IMM8: i32>(a: __m256i, b: __m128i) -> __m
let a = a.as_i32x8();
let b = _mm256_castsi128_si256(b).as_i32x8();
let ret: i32x8 = match IMM8 & 0b1 {
0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
};
transmute(ret)
}
@ -22506,8 +22506,8 @@ pub unsafe fn _mm512_inserti64x4<const IMM8: i32>(a: __m512i, b: __m256i) -> __m
static_assert_imm1!(IMM8);
let b = _mm512_castsi256_si512(b);
match IMM8 & 0b1 {
0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
}
}
@ -22558,22 +22558,22 @@ pub unsafe fn _mm512_insertf32x4<const IMM8: i32>(a: __m512, b: __m128) -> __m51
static_assert_imm2!(IMM8);
let b = _mm512_castps128_ps512(b);
match IMM8 & 0b11 {
0 => simd_shuffle16(
0 => simd_shuffle16!(
a,
b,
[16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
),
1 => simd_shuffle16(
1 => simd_shuffle16!(
a,
b,
[0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
),
2 => simd_shuffle16(
2 => simd_shuffle16!(
a,
b,
[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
),
_ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
_ => simd_shuffle16!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
}
}
@ -22627,8 +22627,8 @@ pub unsafe fn _mm256_insertf32x4<const IMM8: i32>(a: __m256, b: __m128) -> __m25
static_assert_imm1!(IMM8);
let b = _mm256_castps128_ps256(b);
match IMM8 & 0b1 {
0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
}
}
@ -22685,8 +22685,8 @@ pub unsafe fn _mm512_insertf64x4<const IMM8: i32>(a: __m512d, b: __m256d) -> __m
static_assert_imm1!(IMM8);
let b = _mm512_castpd256_pd512(b);
match IMM8 & 0b1 {
0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
}
}
@ -22736,7 +22736,7 @@ pub unsafe fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i {
let a = a.as_i32x16();
let b = b.as_i32x16();
#[rustfmt::skip]
let r: i32x16 = simd_shuffle16(
let r: i32x16 = simd_shuffle16!(
a, b,
[ 2, 18, 3, 19,
2 + 4, 18 + 4, 3 + 4, 19 + 4,
@ -22837,7 +22837,7 @@ pub unsafe fn _mm_maskz_unpackhi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> _
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpckhpd))] //should be vpunpckhqdq
pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i {
simd_shuffle8(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
simd_shuffle8!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
}
/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -22932,7 +22932,7 @@ pub unsafe fn _mm_maskz_unpackhi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> _
#[cfg_attr(test, assert_instr(vunpckhps))]
pub unsafe fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 {
#[rustfmt::skip]
simd_shuffle16(
simd_shuffle16!(
a, b,
[ 2, 18, 3, 19,
2 + 4, 18 + 4, 3 + 4, 19 + 4,
@ -23017,7 +23017,7 @@ pub unsafe fn _mm_maskz_unpackhi_ps(k: __mmask8, a: __m128, b: __m128) -> __m128
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d {
simd_shuffle8(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
simd_shuffle8!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
}
/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -23109,7 +23109,7 @@ pub unsafe fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i {
let a = a.as_i32x16();
let b = b.as_i32x16();
#[rustfmt::skip]
let r: i32x16 = simd_shuffle16(
let r: i32x16 = simd_shuffle16!(
a, b,
[ 0, 16, 1, 17,
0 + 4, 16 + 4, 1 + 4, 17 + 4,
@ -23210,7 +23210,7 @@ pub unsafe fn _mm_maskz_unpacklo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> _
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpcklpd))] //should be vpunpcklqdq
pub unsafe fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i {
simd_shuffle8(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
simd_shuffle8!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
}
/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -23305,7 +23305,7 @@ pub unsafe fn _mm_maskz_unpacklo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> _
#[cfg_attr(test, assert_instr(vunpcklps))]
pub unsafe fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 {
#[rustfmt::skip]
simd_shuffle16(a, b,
simd_shuffle16!(a, b,
[ 0, 16, 1, 17,
0 + 4, 16 + 4, 1 + 4, 17 + 4,
0 + 8, 16 + 8, 1 + 8, 17 + 8,
@ -23389,7 +23389,7 @@ pub unsafe fn _mm_maskz_unpacklo_ps(k: __mmask8, a: __m128, b: __m128) -> __m128
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
pub unsafe fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d {
simd_shuffle8(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
simd_shuffle8!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
}
/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -23477,7 +23477,7 @@ pub unsafe fn _mm_maskz_unpacklo_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m1
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 {
simd_shuffle16(
simd_shuffle16!(
a,
_mm_set1_ps(-1.),
[0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
@ -23490,7 +23490,7 @@ pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 {
simd_shuffle16(
simd_shuffle16!(
a,
_mm256_set1_ps(-1.),
[0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
@ -23503,7 +23503,7 @@ pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextps128_ps512(a: __m128) -> __m512 {
simd_shuffle16(
simd_shuffle16!(
a,
_mm_set1_ps(0.),
[0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
@ -23516,7 +23516,7 @@ pub unsafe fn _mm512_zextps128_ps512(a: __m128) -> __m512 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextps256_ps512(a: __m256) -> __m512 {
simd_shuffle16(
simd_shuffle16!(
a,
_mm256_set1_ps(0.),
[0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
@ -23529,7 +23529,7 @@ pub unsafe fn _mm512_zextps256_ps512(a: __m256) -> __m512 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castps512_ps128(a: __m512) -> __m128 {
simd_shuffle4(a, a, [0, 1, 2, 3])
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
/// Cast vector of type __m512 to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23538,7 +23538,7 @@ pub unsafe fn _mm512_castps512_ps128(a: __m512) -> __m128 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castps512_ps256(a: __m512) -> __m256 {
simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
}
/// Cast vector of type __m512 to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23565,7 +23565,7 @@ pub unsafe fn _mm512_castps_si512(a: __m512) -> __m512i {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castpd128_pd512(a: __m128d) -> __m512d {
simd_shuffle8(a, _mm_set1_pd(-1.), [0, 1, 2, 2, 2, 2, 2, 2])
simd_shuffle8!(a, _mm_set1_pd(-1.), [0, 1, 2, 2, 2, 2, 2, 2])
}
/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23574,7 +23574,7 @@ pub unsafe fn _mm512_castpd128_pd512(a: __m128d) -> __m512d {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castpd256_pd512(a: __m256d) -> __m512d {
simd_shuffle8(a, _mm256_set1_pd(-1.), [0, 1, 2, 3, 4, 4, 4, 4])
simd_shuffle8!(a, _mm256_set1_pd(-1.), [0, 1, 2, 3, 4, 4, 4, 4])
}
/// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23583,7 +23583,7 @@ pub unsafe fn _mm512_castpd256_pd512(a: __m256d) -> __m512d {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d {
simd_shuffle8(a, _mm_set1_pd(0.), [0, 1, 2, 2, 2, 2, 2, 2])
simd_shuffle8!(a, _mm_set1_pd(0.), [0, 1, 2, 2, 2, 2, 2, 2])
}
/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23592,7 +23592,7 @@ pub unsafe fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d {
simd_shuffle8(a, _mm256_set1_pd(0.), [0, 1, 2, 3, 4, 4, 4, 4])
simd_shuffle8!(a, _mm256_set1_pd(0.), [0, 1, 2, 3, 4, 4, 4, 4])
}
/// Cast vector of type __m512d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23601,7 +23601,7 @@ pub unsafe fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castpd512_pd128(a: __m512d) -> __m128d {
simd_shuffle2(a, a, [0, 1])
simd_shuffle2!(a, a, [0, 1])
}
/// Cast vector of type __m512d to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23610,7 +23610,7 @@ pub unsafe fn _mm512_castpd512_pd128(a: __m512d) -> __m128d {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castpd512_pd256(a: __m512d) -> __m256d {
simd_shuffle4(a, a, [0, 1, 2, 3])
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
/// Cast vector of type __m512d to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23637,7 +23637,7 @@ pub unsafe fn _mm512_castpd_si512(a: __m512d) -> __m512i {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i {
simd_shuffle8(a, _mm_set1_epi64x(-1), [0, 1, 2, 2, 2, 2, 2, 2])
simd_shuffle8!(a, _mm_set1_epi64x(-1), [0, 1, 2, 2, 2, 2, 2, 2])
}
/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23646,7 +23646,7 @@ pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castsi256_si512(a: __m256i) -> __m512i {
simd_shuffle8(a, _mm256_set1_epi64x(-1), [0, 1, 2, 3, 4, 4, 4, 4])
simd_shuffle8!(a, _mm256_set1_epi64x(-1), [0, 1, 2, 3, 4, 4, 4, 4])
}
/// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23655,7 +23655,7 @@ pub unsafe fn _mm512_castsi256_si512(a: __m256i) -> __m512i {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextsi128_si512(a: __m128i) -> __m512i {
simd_shuffle8(a, _mm_set1_epi64x(0), [0, 1, 2, 2, 2, 2, 2, 2])
simd_shuffle8!(a, _mm_set1_epi64x(0), [0, 1, 2, 2, 2, 2, 2, 2])
}
/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23664,7 +23664,7 @@ pub unsafe fn _mm512_zextsi128_si512(a: __m128i) -> __m512i {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextsi256_si512(a: __m256i) -> __m512i {
simd_shuffle8(a, _mm256_set1_epi64x(0), [0, 1, 2, 3, 4, 4, 4, 4])
simd_shuffle8!(a, _mm256_set1_epi64x(0), [0, 1, 2, 3, 4, 4, 4, 4])
}
/// Cast vector of type __m512i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23673,7 +23673,7 @@ pub unsafe fn _mm512_zextsi256_si512(a: __m256i) -> __m512i {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castsi512_si128(a: __m512i) -> __m128i {
simd_shuffle2(a, a, [0, 1])
simd_shuffle2!(a, a, [0, 1])
}
/// Cast vector of type __m512i to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23682,7 +23682,7 @@ pub unsafe fn _mm512_castsi512_si128(a: __m512i) -> __m128i {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castsi512_si256(a: __m512i) -> __m256i {
simd_shuffle4(a, a, [0, 1, 2, 3])
simd_shuffle4!(a, a, [0, 1, 2, 3])
}
/// Cast vector of type __m512i to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@ -23722,7 +23722,7 @@ pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 {
#[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastd
pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i {
let a = _mm512_castsi128_si512(a).as_i32x16();
let ret: i32x16 = simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
let ret: i32x16 = simd_shuffle16!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
transmute(ret)
}
@ -23802,7 +23802,7 @@ pub unsafe fn _mm_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m128i {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vbroadcas))] //should be vpbroadcastq
pub unsafe fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i {
simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -23881,7 +23881,7 @@ pub unsafe fn _mm_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m128i {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
pub unsafe fn _mm512_broadcastss_ps(a: __m128) -> __m512 {
simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle16!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
}
/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -23960,7 +23960,7 @@ pub unsafe fn _mm_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m128 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d {
simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
simd_shuffle8!(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
}
/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -24016,7 +24016,7 @@ pub unsafe fn _mm256_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m256d {
#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf
pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i {
let a = a.as_i32x4();
let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]);
let ret: i32x16 = simd_shuffle16!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]);
transmute(ret)
}
@ -24048,7 +24048,7 @@ pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i
#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf
pub unsafe fn _mm256_broadcast_i32x4(a: __m128i) -> __m256i {
let a = a.as_i32x4();
let ret: i32x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]);
let ret: i32x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]);
transmute(ret)
}
@ -24079,7 +24079,7 @@ pub unsafe fn _mm256_maskz_broadcast_i32x4(k: __mmask8, a: __m128i) -> __m256i {
#[inline]
#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm
pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i {
simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
}
/// Broadcast the 4 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -24109,7 +24109,7 @@ pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i {
#[inline]
#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf
pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 {
simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3])
simd_shuffle16!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3])
}
/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -24139,7 +24139,7 @@ pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 {
#[inline]
#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshuf
pub unsafe fn _mm256_broadcast_f32x4(a: __m128) -> __m256 {
simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
}
/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -24169,7 +24169,7 @@ pub unsafe fn _mm256_maskz_broadcast_f32x4(k: __mmask8, a: __m128) -> __m256 {
#[inline]
#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm
pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d {
simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
}
/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -24326,66 +24326,62 @@ pub unsafe fn _mm512_alignr_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __
let b = b.as_i32x16();
let imm8: i32 = IMM8 % 16;
let r: i32x16 = match imm8 {
0 => simd_shuffle16(
0 => simd_shuffle16!(
a,
b,
[
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
],
[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,],
),
1 => simd_shuffle16(
1 => simd_shuffle16!(
a,
b,
[
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,
],
[17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,],
),
2 => simd_shuffle16(
2 => simd_shuffle16!(
a,
b,
[18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1],
),
3 => simd_shuffle16(
3 => simd_shuffle16!(
a,
b,
[19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2],
),
4 => simd_shuffle16(
4 => simd_shuffle16!(
a,
b,
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3],
),
5 => simd_shuffle16(
5 => simd_shuffle16!(
a,
b,
[21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4],
),
6 => simd_shuffle16(
6 => simd_shuffle16!(
a,
b,
[22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5],
),
7 => simd_shuffle16(
7 => simd_shuffle16!(
a,
b,
[23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6],
),
8 => simd_shuffle16(
8 => simd_shuffle16!(
a,
b,
[24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7],
),
9 => simd_shuffle16(
9 => simd_shuffle16!(
a,
b,
[25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8],
),
10 => simd_shuffle16(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
11 => simd_shuffle16(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
12 => simd_shuffle16(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
13 => simd_shuffle16(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
14 => simd_shuffle16(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
_ => simd_shuffle16(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
10 => simd_shuffle16!(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
11 => simd_shuffle16!(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
12 => simd_shuffle16!(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
13 => simd_shuffle16!(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
14 => simd_shuffle16!(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
_ => simd_shuffle16!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
};
transmute(r)
}
@ -24439,22 +24435,22 @@ pub unsafe fn _mm256_alignr_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __
let b = b.as_i32x8();
let imm8: i32 = IMM8 % 16;
let r: i32x8 = match imm8 {
0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
7 => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
8 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
9 => simd_shuffle8(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
10 => simd_shuffle8(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
11 => simd_shuffle8(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
12 => simd_shuffle8(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
13 => simd_shuffle8(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
14 => simd_shuffle8(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
_ => simd_shuffle8(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
1 => simd_shuffle8!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
2 => simd_shuffle8!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
3 => simd_shuffle8!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
4 => simd_shuffle8!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
5 => simd_shuffle8!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
6 => simd_shuffle8!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
7 => simd_shuffle8!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
8 => simd_shuffle8!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
9 => simd_shuffle8!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
10 => simd_shuffle8!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
11 => simd_shuffle8!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
12 => simd_shuffle8!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
13 => simd_shuffle8!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
14 => simd_shuffle8!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
_ => simd_shuffle8!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
};
transmute(r)
}
@ -24508,14 +24504,14 @@ pub unsafe fn _mm_alignr_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m12
let b = b.as_i32x4();
let imm8: i32 = IMM8 % 8;
let r: i32x4 = match imm8 {
0 => simd_shuffle4(a, b, [4, 5, 6, 7]),
1 => simd_shuffle4(a, b, [5, 6, 7, 0]),
2 => simd_shuffle4(a, b, [6, 7, 0, 1]),
3 => simd_shuffle4(a, b, [7, 0, 1, 2]),
4 => simd_shuffle4(a, b, [0, 1, 2, 3]),
5 => simd_shuffle4(a, b, [1, 2, 3, 0]),
6 => simd_shuffle4(a, b, [2, 3, 0, 1]),
_ => simd_shuffle4(a, b, [3, 0, 1, 2]),
0 => simd_shuffle4!(a, b, [4, 5, 6, 7]),
1 => simd_shuffle4!(a, b, [5, 6, 7, 0]),
2 => simd_shuffle4!(a, b, [6, 7, 0, 1]),
3 => simd_shuffle4!(a, b, [7, 0, 1, 2]),
4 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
5 => simd_shuffle4!(a, b, [1, 2, 3, 0]),
6 => simd_shuffle4!(a, b, [2, 3, 0, 1]),
_ => simd_shuffle4!(a, b, [3, 0, 1, 2]),
};
transmute(r)
}
@ -24567,14 +24563,14 @@ pub unsafe fn _mm512_alignr_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __
static_assert_imm8!(IMM8);
let imm8: i32 = IMM8 % 8;
let r: i64x8 = match imm8 {
0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
_ => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
1 => simd_shuffle8!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
2 => simd_shuffle8!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
3 => simd_shuffle8!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
4 => simd_shuffle8!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
5 => simd_shuffle8!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
6 => simd_shuffle8!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
_ => simd_shuffle8!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
};
transmute(r)
}
@ -24626,14 +24622,14 @@ pub unsafe fn _mm256_alignr_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __
static_assert_imm8!(IMM8);
let imm8: i32 = IMM8 % 8;
let r: i64x4 = match imm8 {
0 => simd_shuffle4(a, b, [4, 5, 6, 7]),
1 => simd_shuffle4(a, b, [5, 6, 7, 0]),
2 => simd_shuffle4(a, b, [6, 7, 0, 1]),
3 => simd_shuffle4(a, b, [7, 0, 1, 2]),
4 => simd_shuffle4(a, b, [0, 1, 2, 3]),
5 => simd_shuffle4(a, b, [1, 2, 3, 4]),
6 => simd_shuffle4(a, b, [2, 3, 4, 5]),
_ => simd_shuffle4(a, b, [3, 4, 5, 6]),
0 => simd_shuffle4!(a, b, [4, 5, 6, 7]),
1 => simd_shuffle4!(a, b, [5, 6, 7, 0]),
2 => simd_shuffle4!(a, b, [6, 7, 0, 1]),
3 => simd_shuffle4!(a, b, [7, 0, 1, 2]),
4 => simd_shuffle4!(a, b, [0, 1, 2, 3]),
5 => simd_shuffle4!(a, b, [1, 2, 3, 4]),
6 => simd_shuffle4!(a, b, [2, 3, 4, 5]),
_ => simd_shuffle4!(a, b, [3, 4, 5, 6]),
};
transmute(r)
}
@ -24685,10 +24681,10 @@ pub unsafe fn _mm_alignr_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m12
static_assert_imm8!(IMM8);
let imm8: i32 = IMM8 % 4;
let r: i64x2 = match imm8 {
0 => simd_shuffle2(a, b, [2, 3]),
1 => simd_shuffle2(a, b, [3, 0]),
2 => simd_shuffle2(a, b, [0, 1]),
_ => simd_shuffle2(a, b, [1, 2]),
0 => simd_shuffle2!(a, b, [2, 3]),
1 => simd_shuffle2!(a, b, [3, 0]),
2 => simd_shuffle2!(a, b, [0, 1]),
_ => simd_shuffle2!(a, b, [1, 2]),
};
transmute(r)
}

View file

@ -350,7 +350,7 @@ pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, cmpss(b, a, 1), [4, 1, 2, 3])
simd_shuffle4!(a, cmpss(b, a, 1), [4, 1, 2, 3])
}
/// Compares the lowest `f32` of both inputs for greater than or equal. The
@ -364,7 +364,7 @@ pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, cmpss(b, a, 2), [4, 1, 2, 3])
simd_shuffle4!(a, cmpss(b, a, 2), [4, 1, 2, 3])
}
/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
@ -420,7 +420,7 @@ pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, cmpss(b, a, 5), [4, 1, 2, 3])
simd_shuffle4!(a, cmpss(b, a, 5), [4, 1, 2, 3])
}
/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
@ -434,7 +434,7 @@ pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, cmpss(b, a, 6), [4, 1, 2, 3])
simd_shuffle4!(a, cmpss(b, a, 6), [4, 1, 2, 3])
}
/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
@ -1011,10 +1011,10 @@ pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
static_assert_imm8!(MASK);
simd_shuffle4(
simd_shuffle4!(
a,
b,
[
<const MASK: i32> [
MASK as u32 & 0b11,
(MASK as u32 >> 2) & 0b11,
((MASK as u32 >> 4) & 0b11) + 4,
@ -1032,7 +1032,7 @@ pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, b, [2, 6, 3, 7])
simd_shuffle4!(a, b, [2, 6, 3, 7])
}
/// Unpacks and interleave single-precision (32-bit) floating-point elements
@ -1044,7 +1044,7 @@ pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, b, [0, 4, 1, 5])
simd_shuffle4!(a, b, [0, 4, 1, 5])
}
/// Combine higher half of `a` and `b`. The higher half of `b` occupies the
@ -1057,7 +1057,7 @@ pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
    // TODO: figure out why this is a different instruction on Windows.
simd_shuffle4(a, b, [6, 7, 2, 3])
simd_shuffle4!(a, b, [6, 7, 2, 3])
}
/// Combine lower half of `a` and `b`. The lower half of `b` occupies the
@ -1069,7 +1069,7 @@ pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, b, [0, 1, 4, 5])
simd_shuffle4!(a, b, [0, 1, 4, 5])
}
/// Returns a mask of the most significant bit of each element in `a`.
@ -1201,7 +1201,7 @@ pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
let a = _mm_load_ps(p);
simd_shuffle4(a, a, [3, 2, 1, 0])
simd_shuffle4!(a, a, [3, 2, 1, 0])
}
/// Loads unaligned 64-bits of integer data from memory into new vector.
@ -1253,7 +1253,7 @@ pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
let b: __m128 = simd_shuffle4(a, a, [0, 0, 0, 0]);
let b: __m128 = simd_shuffle4!(a, a, [0, 0, 0, 0]);
*(p as *mut __m128) = b;
}
@ -1329,7 +1329,7 @@ pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
let b: __m128 = simd_shuffle4(a, a, [3, 2, 1, 0]);
let b: __m128 = simd_shuffle4!(a, a, [3, 2, 1, 0]);
*(p as *mut __m128) = b;
}
@ -1347,7 +1347,7 @@ pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
simd_shuffle4(a, b, [4, 1, 2, 3])
simd_shuffle4!(a, b, [4, 1, 2, 3])
}
/// Performs a serializing operation on all store-to-memory instructions that

View file

@ -432,10 +432,10 @@ unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
}
}
let zero = _mm_set1_epi8(0).as_i8x16();
transmute(simd_shuffle16::<i8x16, i8x16>(
transmute::<i8x16, _>(simd_shuffle16!(
zero,
a.as_i8x16(),
[
<const IMM8: i32> [
mask(IMM8, 0),
mask(IMM8, 1),
mask(IMM8, 2),
@ -635,10 +635,10 @@ unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
}
}
let zero = _mm_set1_epi8(0).as_i8x16();
let x: i8x16 = simd_shuffle16(
let x: i8x16 = simd_shuffle16!(
a.as_i8x16(),
zero,
[
<const IMM8: i32> [
mask(IMM8, 0),
mask(IMM8, 1),
mask(IMM8, 2),
@ -895,7 +895,7 @@ pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
let a = a.as_i32x4();
simd_cast::<i32x2, __m128d>(simd_shuffle2(a, a, [0, 1]))
simd_cast::<i32x2, __m128d>(simd_shuffle2!(a, a, [0, 1]))
}
/// Returns `a` with its lower element replaced by `b` after converting it to
@ -1303,7 +1303,7 @@ pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i {
let zero = _mm_setzero_si128();
let r: i64x2 = simd_shuffle2(a.as_i64x2(), zero.as_i64x2(), [0, 2]);
let r: i64x2 = simd_shuffle2!(a.as_i64x2(), zero.as_i64x2(), [0, 2]);
transmute(r)
}
@ -1391,10 +1391,10 @@ pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 {
pub unsafe fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
static_assert_imm8!(IMM8);
let a = a.as_i32x4();
let x: i32x4 = simd_shuffle4(
let x: i32x4 = simd_shuffle4!(
a,
a,
[
<const IMM8: i32> [
IMM8 as u32 & 0b11,
(IMM8 as u32 >> 2) & 0b11,
(IMM8 as u32 >> 4) & 0b11,
@ -1419,10 +1419,10 @@ pub unsafe fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
pub unsafe fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
static_assert_imm8!(IMM8);
let a = a.as_i16x8();
let x: i16x8 = simd_shuffle8(
let x: i16x8 = simd_shuffle8!(
a,
a,
[
<const IMM8: i32> [
0,
1,
2,
@ -1451,10 +1451,10 @@ pub unsafe fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
pub unsafe fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
static_assert_imm8!(IMM8);
let a = a.as_i16x8();
let x: i16x8 = simd_shuffle8(
let x: i16x8 = simd_shuffle8!(
a,
a,
[
<const IMM8: i32> [
IMM8 as u32 & 0b11,
(IMM8 as u32 >> 2) & 0b11,
(IMM8 as u32 >> 4) & 0b11,
@ -1476,7 +1476,7 @@ pub unsafe fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(punpckhbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
transmute::<i8x16, _>(simd_shuffle16(
transmute::<i8x16, _>(simd_shuffle16!(
a.as_i8x16(),
b.as_i8x16(),
[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
@ -1491,7 +1491,7 @@ pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(punpckhwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
let x = simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
let x = simd_shuffle8!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
transmute::<i16x8, _>(x)
}
@ -1503,7 +1503,7 @@ pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
transmute::<i32x4, _>(simd_shuffle4(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7]))
transmute::<i32x4, _>(simd_shuffle4!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7]))
}
/// Unpacks and interleave 64-bit integers from the high half of `a` and `b`.
@ -1514,7 +1514,7 @@ pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(unpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
transmute::<i64x2, _>(simd_shuffle2(a.as_i64x2(), b.as_i64x2(), [1, 3]))
transmute::<i64x2, _>(simd_shuffle2!(a.as_i64x2(), b.as_i64x2(), [1, 3]))
}
/// Unpacks and interleave 8-bit integers from the low half of `a` and `b`.
@ -1525,7 +1525,7 @@ pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(punpcklbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
transmute::<i8x16, _>(simd_shuffle16(
transmute::<i8x16, _>(simd_shuffle16!(
a.as_i8x16(),
b.as_i8x16(),
[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
@ -1540,7 +1540,7 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(punpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
let x = simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
let x = simd_shuffle8!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
transmute::<i16x8, _>(x)
}
@ -1552,7 +1552,7 @@ pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
transmute::<i32x4, _>(simd_shuffle4(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]))
transmute::<i32x4, _>(simd_shuffle4!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]))
}
/// Unpacks and interleave 64-bit integers from the low half of `a` and `b`.
@ -1563,7 +1563,7 @@ pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
transmute::<i64x2, _>(simd_shuffle2(a.as_i64x2(), b.as_i64x2(), [0, 2]))
transmute::<i64x2, _>(simd_shuffle2!(a.as_i64x2(), b.as_i64x2(), [0, 2]))
}
/// Returns a new vector with the low element of `a` replaced by the sum of the
@ -2519,7 +2519,7 @@ pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
let b: __m128d = simd_shuffle2(a, a, [0, 0]);
let b: __m128d = simd_shuffle2!(a, a, [0, 0]);
*(mem_addr as *mut __m128d) = b;
}
@ -2533,7 +2533,7 @@ pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
let b: __m128d = simd_shuffle2(a, a, [0, 0]);
let b: __m128d = simd_shuffle2!(a, a, [0, 0]);
*(mem_addr as *mut __m128d) = b;
}
@ -2548,7 +2548,7 @@ pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
let b: __m128d = simd_shuffle2(a, a, [1, 0]);
let b: __m128d = simd_shuffle2!(a, a, [1, 0]);
*(mem_addr as *mut __m128d) = b;
}
@ -2612,7 +2612,7 @@ pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
let a = _mm_load_pd(mem_addr);
simd_shuffle2(a, a, [1, 0])
simd_shuffle2!(a, a, [1, 0])
}
/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
@ -2653,7 +2653,7 @@ pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
static_assert_imm8!(MASK);
simd_shuffle2(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2])
simd_shuffle2!(a, b, <const MASK: i32> [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2])
}
/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
@ -2777,7 +2777,7 @@ pub unsafe fn _mm_undefined_si128() -> __m128i {
#[cfg_attr(test, assert_instr(unpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
simd_shuffle2(a, b, [1, 3])
simd_shuffle2!(a, b, [1, 3])
}
/// The resulting `__m128d` element is composed of the high-order values of
@ -2792,7 +2792,7 @@ pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
simd_shuffle2(a, b, [0, 2])
simd_shuffle2!(a, b, [0, 2])
}
#[allow(improper_ctypes)]

View file

@ -106,7 +106,7 @@ pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(movddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d {
simd_shuffle2(a, a, [0, 0])
simd_shuffle2!(a, a, [0, 0])
}
/// Loads a double-precision (64-bit) floating-point element from memory
@ -130,7 +130,7 @@ pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d {
#[cfg_attr(test, assert_instr(movshdup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 {
simd_shuffle4(a, a, [1, 1, 3, 3])
simd_shuffle4!(a, a, [1, 1, 3, 3])
}
/// Duplicate even-indexed single-precision (32-bit) floating-point elements
@ -142,7 +142,7 @@ pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(movsldup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 {
simd_shuffle4(a, a, [0, 0, 2, 2])
simd_shuffle4!(a, a, [0, 0, 2, 2])
}
#[allow(improper_ctypes)]

View file

@ -379,7 +379,7 @@ pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
let a = a.as_i8x16();
let a = simd_shuffle8::<_, i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
let a: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
transmute(simd_cast::<_, i16x8>(a))
}
@ -392,7 +392,7 @@ pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
let a = a.as_i8x16();
let a = simd_shuffle4::<_, i8x4>(a, a, [0, 1, 2, 3]);
let a: i8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
transmute(simd_cast::<_, i32x4>(a))
}
@ -406,7 +406,7 @@ pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
let a = a.as_i8x16();
let a = simd_shuffle2::<_, i8x2>(a, a, [0, 1]);
let a: i8x2 = simd_shuffle2!(a, a, [0, 1]);
transmute(simd_cast::<_, i64x2>(a))
}
@ -419,7 +419,7 @@ pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
let a = a.as_i16x8();
let a = simd_shuffle4::<_, i16x4>(a, a, [0, 1, 2, 3]);
let a: i16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
transmute(simd_cast::<_, i32x4>(a))
}
@ -432,7 +432,7 @@ pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
let a = a.as_i16x8();
let a = simd_shuffle2::<_, i16x2>(a, a, [0, 1]);
let a: i16x2 = simd_shuffle2!(a, a, [0, 1]);
transmute(simd_cast::<_, i64x2>(a))
}
@ -445,7 +445,7 @@ pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
let a = a.as_i32x4();
let a = simd_shuffle2::<_, i32x2>(a, a, [0, 1]);
let a: i32x2 = simd_shuffle2!(a, a, [0, 1]);
transmute(simd_cast::<_, i64x2>(a))
}
@ -458,7 +458,7 @@ pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
let a = a.as_u8x16();
let a = simd_shuffle8::<_, u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
let a: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
transmute(simd_cast::<_, i16x8>(a))
}
@ -471,7 +471,7 @@ pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
let a = a.as_u8x16();
let a = simd_shuffle4::<_, u8x4>(a, a, [0, 1, 2, 3]);
let a: u8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
transmute(simd_cast::<_, i32x4>(a))
}
@ -484,7 +484,7 @@ pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
let a = a.as_u8x16();
let a = simd_shuffle2::<_, u8x2>(a, a, [0, 1]);
let a: u8x2 = simd_shuffle2!(a, a, [0, 1]);
transmute(simd_cast::<_, i64x2>(a))
}
@ -498,7 +498,7 @@ pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
let a = a.as_u16x8();
let a = simd_shuffle4::<_, u16x4>(a, a, [0, 1, 2, 3]);
let a: u16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
transmute(simd_cast::<_, i32x4>(a))
}
@ -512,7 +512,7 @@ pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
let a = a.as_u16x8();
let a = simd_shuffle2::<_, u16x2>(a, a, [0, 1]);
let a: u16x2 = simd_shuffle2!(a, a, [0, 1]);
transmute(simd_cast::<_, i64x2>(a))
}
@ -526,7 +526,7 @@ pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
let a = a.as_u32x4();
let a = simd_shuffle2::<_, u32x2>(a, a, [0, 1]);
let a: u32x2 = simd_shuffle2!(a, a, [0, 1]);
transmute(simd_cast::<_, i64x2>(a))
}

View file

@ -113,10 +113,10 @@ pub unsafe fn _mm_alignr_epi8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128
shift + i
}
}
let r: i8x16 = simd_shuffle16(
let r: i8x16 = simd_shuffle16!(
b.as_i8x16(),
a.as_i8x16(),
[
<const IMM8: i32> [
mask(IMM8 as u32, 0),
mask(IMM8 as u32, 1),
mask(IMM8 as u32, 2),

View file

@ -194,8 +194,8 @@ generate int32x2_t:int32x2_t:int64x2_t
/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle8, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, {vabd_u8, c, d}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
@ -207,8 +207,8 @@ generate uint8x16_t:uint8x16_t:uint16x8_t
/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle4, c:uint16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4, d:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_cast, {vabd_u16, c, d}
a = 1, 2, 3, 4, 8, 9, 11, 12
b = 10, 10, 10, 10, 10, 10, 10, 10
@ -220,8 +220,8 @@ generate uint16x8_t:uint16x8_t:uint32x4_t
/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle2, c:uint32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2, d:uint32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3]
multi_fn = simd_cast, {vabd_u32, c, d}
a = 1, 2, 3, 4
b = 10, 10, 10, 10
@ -233,8 +233,8 @@ generate uint32x4_t:uint32x4_t:uint64x2_t
/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle8, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, e:uint8x8_t, {vabd_s8, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
@ -247,8 +247,8 @@ generate int8x16_t:int8x16_t:int16x8_t
/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle4, c:int16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4, d:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_cast, e:uint16x4_t, {vabd_s16, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4, 9, 10, 11, 12
@ -261,8 +261,8 @@ generate int16x8_t:int16x8_t:int32x4_t
/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle2, c:int32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2, d:int32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3]
multi_fn = simd_cast, e:uint32x2_t, {vabd_s32, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4
@ -727,7 +727,7 @@ lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
@ -744,7 +744,7 @@ lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1., 2., 3., 4.
b = 0., 0.5, 0., 0.
n = 0:1
@ -759,8 +759,8 @@ lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in_len-noext, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in_len-LANE2}
multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
@ -777,8 +777,8 @@ lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in_len-noext, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in_len-LANE2}
multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2}
a = 1., 2., 3., 4.
b = 0., 0.5, 0., 0.
n = 0:1
@ -793,8 +793,8 @@ lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-noext, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
@ -811,8 +811,8 @@ lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-noext, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1:0
@ -827,8 +827,8 @@ lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-noext, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1., 2., 3., 4.
b = 0.5, 0., 0., 0.
n = 1:0
@ -897,7 +897,7 @@ generate float32x2_t:float64x2_t
/// Floating-point convert to higher precision long
name = vcvt_high
noq-double-suffixes
multi_fn = simd_shuffle2, b:float32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2!, b:float32x2_t, a, a, [2, 3]
multi_fn = simd_cast, b
a = -1.2, 1.2, 2.3, 3.4
validate 2.3f32 as f64, 3.4f32 as f64
@ -918,7 +918,7 @@ generate float64x2_t:float32x2_t
/// Floating-point convert to lower precision narrow
name = vcvt_high
noq-double-suffixes
multi_fn = simd_shuffle4, a, {simd_cast, b}, [0, 1, 2, 3]
multi_fn = simd_shuffle4!, a, {simd_cast, b}, [0, 1, 2, 3]
a = -1.2, 1.2
b = -2.3, 3.4
validate -1.2, 1.2, -2.3f64 as f32, 3.4f64 as f32
@ -939,7 +939,7 @@ generate float64x2_t:float32x2_t
/// Floating-point convert to lower precision narrow, rounding to odd
name = vcvtx_high
noq-double-suffixes
multi_fn = simd_shuffle4, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3]
multi_fn = simd_shuffle4!, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3]
a = -1.0, 2.0
b = -3.0, 4.0
validate -1.0, 2.0, -3.0, 4.0
@ -1162,7 +1162,7 @@ name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-noext, a, a, {dup-out_len-N as u32}
multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
n = HFLEN
validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
@ -1188,7 +1188,7 @@ name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-noext, a, a, {dup-out_len-N as u32}
multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
n = HFLEN
validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
@ -1202,7 +1202,7 @@ name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-noext, a, a, {dup-out_len-N as u32}
multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
a = 1., 1., 1., 4.
n = HFLEN
validate 1., 1., 1., 1.
@ -1303,7 +1303,7 @@ generate float32x2_t:f32, float32x4_t:f32, float64x1_t:f64, float64x2_t:f64
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15
b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11
n = HFLEN
@ -1317,7 +1317,7 @@ generate int*_t, uint*_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15
b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11
n = HFLEN
@ -1333,7 +1333,7 @@ generate int64x2_t, uint64x2_t
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
a = 0., 2., 2., 3.
b = 3., 4., 5., 6.,
n = HFLEN
@ -1403,7 +1403,7 @@ name = vmla
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@ -1422,7 +1422,7 @@ name = vmla
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 0., 1., 2., 3.
b = 2., 2., 2., 2.
c = 0., 3., 0., 0.
@ -1477,7 +1477,7 @@ name = vmlal_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@ -1495,8 +1495,8 @@ generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint
/// Signed multiply-add long
name = vmlal_high
no-q
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
multi_fn = vmlal-noqself-noext, a, b, c
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@ -1510,8 +1510,8 @@ generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:
/// Unsigned multiply-add long
name = vmlal_high
no-q
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
multi_fn = vmlal-noqself-noext, a, b, c
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@ -1541,7 +1541,7 @@ name = vmlal_high_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@ -1613,7 +1613,7 @@ name = vmls
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@ -1632,7 +1632,7 @@ name = vmls
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 6., 7., 8., 9.
b = 2., 2., 2., 2.
c = 0., 3., 0., 0.
@ -1687,7 +1687,7 @@ name = vmlsl_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@ -1705,8 +1705,8 @@ generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint
/// Signed multiply-subtract long
name = vmlsl_high
no-q
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
multi_fn = vmlsl-noqself-noext, a, b, c
a = 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@ -1720,8 +1720,8 @@ generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:
/// Unsigned multiply-subtract long
name = vmlsl_high
no-q
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
multi_fn = vmlsl-noqself-noext, a, b, c
a = 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
@ -1751,7 +1751,7 @@ name = vmlsl_high_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 14, 15, 16, 17, 18, 19, 20, 21
b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@ -1769,7 +1769,7 @@ generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint
name = vmovn_high
no-q
multi_fn = simd_cast, c:in_t0, b
multi_fn = simd_shuffle-out_len-noext, a, c, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, c, {asc-0-out_len}
a = 0, 1, 2, 3, 2, 3, 4, 5
b = 2, 3, 4, 5, 12, 13, 14, 15
validate 0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15
@ -2070,7 +2070,7 @@ name = vmul
lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_mul, a, {simd_shuffle-out_len-noext, b, b, {dup-out_len-LANE as u32}}
multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
@ -2102,7 +2102,7 @@ name = vmul
lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_mul, a, {simd_shuffle-out_len-noext, b, b, {dup-out_len-LANE as u32}}
multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}}
a = 1., 2., 3., 4.
b = 2., 0., 0., 0.
n = 0
@ -2155,8 +2155,8 @@ generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:i
/// Signed multiply long
name = vmull_high
no-q
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = vmull-noqself-noext, a, b
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
@ -2181,8 +2181,8 @@ generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint3
/// Unsigned multiply long
name = vmull_high
no-q
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = vmull-noqself-noext, a, b
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
@ -2222,8 +2222,8 @@ link-arm = vmullp.v2i64:int64x1_t:int64x1_t:int64x1_t:int64x2_t
/// Polynomial multiply long
name = vmull_high
no-q
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = vmull-noqself-noext, a, b
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3
@ -2263,7 +2263,7 @@ generate uint16x4_t:u16:uint32x4_t, uint32x2_t:u32:uint64x2_t
name = vmull_lane
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}}
multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
@ -2294,7 +2294,7 @@ generate uint16x8_t:u16:uint32x4_t, uint32x4_t:u32:uint64x2_t
name = vmull_high_lane
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}}
multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
@ -2336,7 +2336,7 @@ name = vmulx
lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-noext, b, b, {dup-in0_len-LANE as u32}}
multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
a = 1., 2., 3., 4.
b = 2., 0., 0., 0.
n = 0
@ -2573,7 +2573,7 @@ generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
name = vsubhn_high
no-q
multi_fn = vsubhn-noqself-noext, d:in_t0, b, c
multi_fn = simd_shuffle-out_len-noext, a, d, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, d, {asc-0-out_len}
a = MAX, 0, MAX, 0, MAX, 0, MAX, 0
b = MAX, 1, MAX, 1, MAX, 1, MAX, 1
c = 1, 0, 1, 0, 1, 0, 1, 0
@ -2629,7 +2629,7 @@ generate uint16x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint32x4_t, uint
/// Signed Subtract Wide
name = vsubw_high
no-q
multi_fn = simd_shuffle8, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_sub, a, {simd_cast, c}
a = 8, 9, 10, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16
@ -2641,7 +2641,7 @@ generate int16x8_t:int8x16_t:int16x8_t
/// Signed Subtract Wide
name = vsubw_high
no-q
multi_fn = simd_shuffle4, c:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, c:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_sub, a, {simd_cast, c}
a = 8, 9, 10, 11
b = 0, 1, 2, 3, 8, 9, 10, 11
@ -2653,7 +2653,7 @@ generate int32x4_t:int16x8_t:int32x4_t
/// Signed Subtract Wide
name = vsubw_high
no-q
multi_fn = simd_shuffle2, c:int32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, c:int32x2_t, b, b, [2, 3]
multi_fn = simd_sub, a, {simd_cast, c}
a = 8, 9
b = 6, 7, 8, 9
@ -2665,7 +2665,7 @@ generate int64x2_t:int32x4_t:int64x2_t
/// Unsigned Subtract Wide
name = vsubw_high
no-q
multi_fn = simd_shuffle8, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_sub, a, {simd_cast, c}
a = 8, 9, 10, 11, 12, 13, 14, 15
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@ -2677,7 +2677,7 @@ generate uint16x8_t:uint8x16_t:uint16x8_t
/// Unsigned Subtract Wide
name = vsubw_high
no-q
multi_fn = simd_shuffle4, c:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, c:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_sub, a, {simd_cast, c}
a = 8, 9, 10, 11
b = 0, 1, 2, 3, 8, 9, 10, 11
@ -2689,7 +2689,7 @@ generate uint32x4_t:uint16x8_t:uint32x4_t
/// Unsigned Subtract Wide
name = vsubw_high
no-q
multi_fn = simd_shuffle2, c:uint32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, c:uint32x2_t, b, b, [2, 3]
multi_fn = simd_sub, a, {simd_cast, c}
a = 8, 9
b = 6, 7, 8, 9
@ -2731,9 +2731,9 @@ generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint3
/// Signed Subtract Long
name = vsubl_high
no-q
multi_fn = simd_shuffle8, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, d:out_t, c
multi_fn = simd_shuffle8, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, f:out_t, e
multi_fn = simd_sub, d, f
@ -2747,9 +2747,9 @@ generate int8x16_t:int8x16_t:int16x8_t
/// Signed Subtract Long
name = vsubl_high
no-q
multi_fn = simd_shuffle4, c:int16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_cast, d:out_t, c
multi_fn = simd_shuffle4, e:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, e:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_cast, f:out_t, e
multi_fn = simd_sub, d, f
@ -2763,9 +2763,9 @@ generate int16x8_t:int16x8_t:int32x4_t
/// Signed Subtract Long
name = vsubl_high
no-q
multi_fn = simd_shuffle2, c:int32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3]
multi_fn = simd_cast, d:out_t, c
multi_fn = simd_shuffle2, e:int32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, e:int32x2_t, b, b, [2, 3]
multi_fn = simd_cast, f:out_t, e
multi_fn = simd_sub, d, f
@ -2779,9 +2779,9 @@ generate int32x4_t:int32x4_t:int64x2_t
/// Unsigned Subtract Long
name = vsubl_high
no-q
multi_fn = simd_shuffle8, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, d:out_t, c
multi_fn = simd_shuffle8, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, f:out_t, e
multi_fn = simd_sub, d, f
@ -2795,9 +2795,9 @@ generate uint8x16_t:uint8x16_t:uint16x8_t
/// Unsigned Subtract Long
name = vsubl_high
no-q
multi_fn = simd_shuffle4, c:uint16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_cast, d:out_t, c
multi_fn = simd_shuffle4, e:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, e:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_cast, f:out_t, e
multi_fn = simd_sub, d, f
@ -2811,9 +2811,9 @@ generate uint16x8_t:uint16x8_t:uint32x4_t
/// Unsigned Subtract Long
name = vsubl_high
no-q
multi_fn = simd_shuffle2, c:uint32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3]
multi_fn = simd_cast, d:out_t, c
multi_fn = simd_shuffle2, e:uint32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, e:uint32x2_t, b, b, [2, 3]
multi_fn = simd_cast, f:out_t, e
multi_fn = simd_sub, d, f
@ -3011,8 +3011,8 @@ generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t
/// Signed saturating doubling multiply long
name = vqdmull_high
no-q
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {asc-halflen-halflen}
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {asc-halflen-halflen}
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-halflen-halflen}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {asc-halflen-halflen}
multi_fn = vqdmull-noqself-noext, a, b
a = 0, 1, 4, 5, 4, 5, 6, 7
b = 1, 2, 5, 6, 5, 6, 7, 8
@ -3024,7 +3024,7 @@ generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t
/// Signed saturating doubling multiply long
name = vqdmull_high_n
no-q
multi_fn = simd_shuffle-out_len-noext, a:in_ntt, a, a, {asc-out_len-out_len}
multi_fn = simd_shuffle-out_len-!, a:in_ntt, a, a, {asc-out_len-out_len}
multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
multi_fn = vqdmull-in_ntt-noext, a, b
a = 0, 2, 8, 10, 8, 10, 12, 14
@ -3038,7 +3038,7 @@ generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t
name = vqdmull_lane
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-noext, b:in_t0, b, b, {dup-out_len-N as u32}
multi_fn = simd_shuffle-out_len-!, b:in_t0, b, b, {dup-out_len-N as u32}
multi_fn = vqdmull-noqself-noext, a, b
a = 1, 2, 3, 4
b = 0, 2, 2, 0, 2, 0, 0, 0
@ -3083,8 +3083,8 @@ generate i32:int32x2_t:i64, i32:int32x4_t:i64
name = vqdmull_high_lane
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-noext, a:in_t, a, a, {asc-out_len-out_len}
multi_fn = simd_shuffle-out_len-noext, b:in_t, b, b, {dup-out_len-N as u32}
multi_fn = simd_shuffle-out_len-!, a:in_t, a, a, {asc-out_len-out_len}
multi_fn = simd_shuffle-out_len-!, b:in_t, b, b, {dup-out_len-N as u32}
multi_fn = vqdmull-self-noext, a, b
a = 0, 1, 4, 5, 4, 5, 6, 7
b = 0, 2, 2, 0, 2, 0, 0, 0
@ -3098,8 +3098,8 @@ generate int16x8_t:int16x4_t:int32x4_t, int32x4_t:int32x2_t:int64x2_t
name = vqdmull_high_lane
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {asc-out_len-out_len}
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {dup-out_len-N as u32}
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-out_len-out_len}
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {dup-out_len-N as u32}
multi_fn = vqdmull-noqself-noext, a, b
a = 0, 1, 4, 5, 4, 5, 6, 7
b = 0, 2, 2, 0, 2, 0, 0, 0
@ -3390,7 +3390,7 @@ name = vqrdmulh
lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_shuffle-out_len-noext, b:out_t, b, b, {dup-out_len-LANE as u32}
multi_fn = simd_shuffle-out_len-!, b:out_t, b, b, {dup-out_len-LANE as u32}
multi_fn = vqrdmulh-out-noext, a, b
a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
b = 0, 2, 0, 0, 0, 0, 0, 0,
@ -3616,7 +3616,7 @@ name = vqrshrn_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 2, 3, 2, 3, 6, 7
b = 8, 12, 24, 28, 48, 52, 56, 60
n = 2
@ -3662,7 +3662,7 @@ name = vqrshrn_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 2, 3, 2, 3, 6, 7
b = 8, 12, 24, 28, 48, 52, 56, 60
n = 2
@ -3708,7 +3708,7 @@ name = vqrshrun_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vqrshrun_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vqrshrun_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 2, 3, 2, 3, 6, 7
b = 8, 12, 24, 28, 48, 52, 56, 60
n = 2
@ -3858,7 +3858,7 @@ name = vqshrn_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 8, 9, 8, 9, 10, 11
b = 32, 36, 40, 44, 48, 52, 56, 60
n = 2
@ -3903,7 +3903,7 @@ name = vqshrn_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 8, 9, 8, 9, 10, 11
b = 32, 36, 40, 44, 48, 52, 56, 60
n = 2
@ -3948,7 +3948,7 @@ name = vqshrun_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vqshrun_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vqshrun_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 8, 9, 8, 9, 10, 11
b = 32, 36, 40, 44, 48, 52, 56, 60
n = 2
@ -4312,7 +4312,7 @@ name = vrshrn_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 8, 9, 8, 9, 10, 11
b = 32, 36, 40, 44, 48, 52, 56, 60
n = 2
@ -4542,7 +4542,7 @@ name = vshll_high_n
no-q
constn = N
multi_fn = static_assert-N-0-bits
multi_fn = simd_shuffle-out_len-noext, b:half, a, a, {asc-halflen-halflen}
multi_fn = simd_shuffle-out_len-!, b:half, a, a, {asc-halflen-halflen}
multi_fn = vshll_n-noqself-::<N>, b
a = 0, 0, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8
n = 2
@ -4589,7 +4589,7 @@ name = vshrn_high_n
no-q
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-noext, a, {vshrn_n-noqself-::<N>, b}, {asc-0-out_len}
multi_fn = simd_shuffle-out_len-!, a, {vshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 1, 2, 5, 6, 5, 6, 7, 8
b = 20, 24, 28, 32, 52, 56, 60, 64
n = 2
@ -4631,7 +4631,7 @@ generate uint*_t, uint64x*_t
/// Transpose vectors
name = vtrn1
multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-1-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len}
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
validate 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
@ -4644,7 +4644,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
/// Transpose vectors
name = vtrn1
multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-1-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len}
a = 0., 2., 4., 6., 8., 10., 12., 14.
b = 1., 3., 5., 7., 9., 11., 13., 15.
validate 0., 1., 4., 5., 8., 9., 12., 13.
@ -4657,7 +4657,7 @@ generate float32x2_t, float64x2_t
/// Transpose vectors
name = vtrn2
multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-2-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len}
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
validate 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
@ -4670,7 +4670,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
/// Transpose vectors
name = vtrn2
multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-2-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len}
a = 0., 2., 4., 6., 8., 10., 12., 14.
b = 1., 3., 5., 7., 9., 11., 13., 15.
validate 2., 3., 6., 7., 10., 11., 14., 15.
@ -4683,7 +4683,7 @@ generate float32x2_t, float64x2_t
/// Zip vectors
name = vzip1
multi_fn = simd_shuffle-in_len-noext, a, b, {zip-1-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len}
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@ -4693,7 +4693,7 @@ generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4
/// Zip vectors
name = vzip1
multi_fn = simd_shuffle-in_len-noext, a, b, {zip-1-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len}
a = 0., 2., 4., 6., 8., 10., 12., 14.
b = 1., 3., 5., 7., 9., 11., 13., 15.
validate 0., 1., 2., 3., 4., 5., 6., 7.
@ -4703,7 +4703,7 @@ generate float32x2_t, float32x4_t, float64x2_t
/// Zip vectors
name = vzip2
multi_fn = simd_shuffle-in_len-noext, a, b, {zip-2-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len}
a = 0, 16, 16, 18, 16, 18, 20, 22, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 17, 17, 19, 17, 19, 21, 23, 17, 19, 21, 23, 25, 27, 29, 31
validate 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@ -4713,7 +4713,7 @@ generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4
/// Zip vectors
name = vzip2
multi_fn = simd_shuffle-in_len-noext, a, b, {zip-2-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len}
a = 0., 8., 8., 10., 8., 10., 12., 14.
b = 1., 9., 9., 11., 9., 11., 13., 15.
validate 8., 9., 10., 11., 12., 13., 14., 15.
@ -4723,7 +4723,7 @@ generate float32x2_t, float32x4_t, float64x2_t
/// Unzip vectors
name = vuzp1
multi_fn = simd_shuffle-in_len-noext, a, b, {unzip-1-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len}
a = 1, 0, 2, 0, 2, 0, 3, 0, 2, 0, 3, 0, 7, 0, 8, 0
b = 2, 0, 3, 0, 7, 0, 8, 0, 13, 0, 14, 0, 15, 0, 16, 0
validate 1, 2, 2, 3, 2, 3, 7, 8, 2, 3, 7, 8, 13, 14, 15, 16
@ -4736,7 +4736,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
/// Unzip vectors
name = vuzp1
multi_fn = simd_shuffle-in_len-noext, a, b, {unzip-1-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len}
a = 0., 8., 1., 9., 4., 12., 5., 13.
b = 1., 10., 3., 11., 6., 14., 7., 15.
validate 0., 1., 1., 3., 4., 5., 6., 7.
@ -4749,7 +4749,7 @@ generate float32x2_t, float64x2_t
/// Unzip vectors
name = vuzp2
multi_fn = simd_shuffle-in_len-noext, a, b, {unzip-2-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len}
a = 0, 17, 0, 18, 0, 18, 0, 19, 0, 18, 0, 19, 0, 23, 0, 24
b = 0, 18, 0, 19, 0, 23, 0, 24, 0, 29, 0, 30, 0, 31, 0, 32
validate 17, 18, 18, 19, 18, 19, 23, 24, 18, 19, 23, 24, 29, 30, 31, 32
@ -4762,7 +4762,7 @@ generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t
/// Unzip vectors
name = vuzp2
multi_fn = simd_shuffle-in_len-noext, a, b, {unzip-2-in_len}
multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len}
a = 0., 8., 1., 9., 4., 12., 5., 13.
b = 2., 9., 3., 11., 6., 14., 7., 15.
validate 8., 9., 9., 11., 12., 13., 14., 15.
@ -4793,8 +4793,8 @@ generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16
/// Unsigned Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle8, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = vabd_u8, d, e, f:uint8x8_t
multi_fn = simd_add, a, {simd_cast, f}
a = 9, 10, 11, 12, 13, 14, 15, 16
@ -4808,8 +4808,8 @@ generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t
/// Unsigned Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle4, d:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4, e:uint16x4_t, c, c, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, e:uint16x4_t, c, c, [4, 5, 6, 7]
multi_fn = vabd_u16, d, e, f:uint16x4_t
multi_fn = simd_add, a, {simd_cast, f}
a = 9, 10, 11, 12
@ -4823,8 +4823,8 @@ generate uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t
/// Unsigned Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle2, d:uint32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2, e:uint32x2_t, c, c, [2, 3]
multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, e:uint32x2_t, c, c, [2, 3]
multi_fn = vabd_u32, d, e, f:uint32x2_t
multi_fn = simd_add, a, {simd_cast, f}
a = 15, 16
@ -4884,8 +4884,8 @@ generate int64x2_t:int32x2_t:int32x2_t:int64x2_t
/// Signed Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle8, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = vabd_s8, d, e, f:int8x8_t
multi_fn = simd_cast, f:uint8x8_t, f
multi_fn = simd_add, a, {simd_cast, f}
@ -4900,8 +4900,8 @@ generate int16x8_t:int8x16_t:int8x16_t:int16x8_t
/// Signed Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle4, d:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4, e:int16x4_t, c, c, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, e:int16x4_t, c, c, [4, 5, 6, 7]
multi_fn = vabd_s16, d, e, f:int16x4_t
multi_fn = simd_cast, f:uint16x4_t, f
multi_fn = simd_add, a, {simd_cast, f}
@ -4916,8 +4916,8 @@ generate int32x4_t:int16x8_t:int16x8_t:int32x4_t
/// Signed Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle2, d:int32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2, e:int32x2_t, c, c, [2, 3]
multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, e:int32x2_t, c, c, [2, 3]
multi_fn = vabd_s32, d, e, f:int32x2_t
multi_fn = simd_cast, f:uint32x2_t, f
multi_fn = simd_add, a, {simd_cast, f}

View file

@ -988,6 +988,17 @@ fn gen_aarch64(
);
}
};
let const_declare = if let Some(constn) = constn {
if constn.contains(":") {
let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
assert_eq!(constns.len(), 2);
format!(r#"<const {}: i32, const {}: i32>"#, constns[0], constns[1])
} else {
format!(r#"<const {}: i32>"#, constn)
}
} else {
String::new()
};
let multi_calls = if !multi_fn.is_empty() {
let mut calls = String::new();
for i in 0..multi_fn.len() {
@ -997,6 +1008,7 @@ fn gen_aarch64(
calls.push_str(&get_call(
&multi_fn[i],
current_name,
&const_declare,
in_t,
out_t,
fixed,
@ -1007,17 +1019,6 @@ fn gen_aarch64(
} else {
String::new()
};
let const_declare = if let Some(constn) = constn {
if constn.contains(":") {
let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
assert_eq!(constns.len(), 2);
format!(r#"<const {}: i32, const {}: i32>"#, constns[0], constns[1])
} else {
format!(r#"<const {}: i32>"#, constn)
}
} else {
String::new()
};
let const_assert = if let Some(constn) = constn {
if constn.contains(":") {
let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
@ -1582,6 +1583,11 @@ fn gen_arm(
));
}
};
let const_declare = if let Some(constn) = constn {
format!(r#"<const {}: i32>"#, constn)
} else {
String::new()
};
let multi_calls = if !multi_fn.is_empty() {
let mut calls = String::new();
for i in 0..multi_fn.len() {
@ -1591,6 +1597,7 @@ fn gen_arm(
calls.push_str(&get_call(
&multi_fn[i],
current_name,
&const_declare,
in_t,
out_t,
fixed,
@ -1601,11 +1608,6 @@ fn gen_arm(
} else {
String::new()
};
let const_declare = if let Some(constn) = constn {
format!(r#"<const {}: i32>"#, constn)
} else {
String::new()
};
let const_assert = if let Some(constn) = constn {
format!(
r#", {} = {}"#,
@ -2003,6 +2005,7 @@ fn expand_intrinsic(intr: &str, t: &str) -> String {
fn get_call(
in_str: &str,
current_name: &str,
const_declare: &str,
in_t: &[&str; 3],
out_t: &str,
fixed: &Vec<String>,
@ -2041,7 +2044,7 @@ fn get_call(
"halflen" => type_len(in_t[1]) / 2,
_ => 0,
};
let mut s = String::from("[");
let mut s = format!("{} [", const_declare);
for i in 0..len {
if i != 0 {
s.push_str(", ");
@ -2084,7 +2087,7 @@ fn get_call(
"in0_len" => type_len(in_t[0]),
_ => 0,
};
let mut s = String::from("[");
let mut s = format!("{} [", const_declare);
for i in 0..len {
if i != 0 {
s.push_str(", ");
@ -2167,7 +2170,15 @@ fn get_call(
let sub_match = format!(
" {} => {},\n",
i,
get_call(&sub_call, current_name, in_t, out_t, fixed, Some(i as i32))
get_call(
&sub_call,
current_name,
const_declare,
in_t,
out_t,
fixed,
Some(i as i32)
)
);
call.push_str(&sub_match);
}
@ -2210,6 +2221,7 @@ fn get_call(
let sub_call = get_call(
&sub_fn[1..sub_fn.len() - 1],
current_name,
const_declare,
in_t,
out_t,
fixed,