Convert the last avx512f and avx512vpclmulqdq intrinsics (#1068)
This commit is contained in:
parent
90e778e408
commit
f0a9100c7f
4 changed files with 104 additions and 116 deletions
|
|
@@ -21896,57 +21896,57 @@ pub unsafe fn _mm256_maskz_extractf32x4_ps<const IMM8: i32>(k: __mmask8, a: __m2
|
|||
transmute(simd_select_bitmask(k, r.as_f32x4(), zero))
|
||||
}
|
||||
|
||||
/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with imm8, and store the result in dst.
|
||||
/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the result in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extracti64x4_epi64&expand=2473)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f")]
|
||||
#[cfg_attr(
|
||||
all(test, not(target_os = "windows")),
|
||||
assert_instr(vextractf64x4, imm8 = 1) //should be vextracti64x4
|
||||
assert_instr(vextractf64x4, IMM1 = 1) //should be vextracti64x4
|
||||
)]
|
||||
#[rustc_args_required_const(1)]
|
||||
pub unsafe fn _mm512_extracti64x4_epi64(a: __m512i, imm8: i32) -> __m256i {
|
||||
assert!(imm8 >= 0 && imm8 <= 1);
|
||||
match imm8 & 0x1 {
|
||||
#[rustc_legacy_const_generics(1)]
|
||||
pub unsafe fn _mm512_extracti64x4_epi64<const IMM1: i32>(a: __m512i) -> __m256i {
|
||||
static_assert_imm1!(IMM1);
|
||||
match IMM1 {
|
||||
0 => simd_shuffle4(a, _mm512_set1_epi64(0), [0, 1, 2, 3]),
|
||||
_ => simd_shuffle4(a, _mm512_set1_epi64(0), [4, 5, 6, 7]),
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extracti64x4_epi64&expand=2474)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f")]
|
||||
#[cfg_attr(
|
||||
all(test, not(target_os = "windows")),
|
||||
assert_instr(vextracti64x4, IMM8 = 1)
|
||||
assert_instr(vextracti64x4, IMM1 = 1)
|
||||
)]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn _mm512_mask_extracti64x4_epi64<const IMM8: i32>(
|
||||
pub unsafe fn _mm512_mask_extracti64x4_epi64<const IMM1: i32>(
|
||||
src: __m256i,
|
||||
k: __mmask8,
|
||||
a: __m512i,
|
||||
) -> __m256i {
|
||||
static_assert_imm1!(IMM8);
|
||||
let r = _mm512_extracti64x4_epi64(a, IMM8);
|
||||
static_assert_imm1!(IMM1);
|
||||
let r = _mm512_extracti64x4_epi64::<IMM1>(a);
|
||||
transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4()))
|
||||
}
|
||||
|
||||
/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
|
||||
/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extracti64x4_epi64&expand=2475)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f")]
|
||||
#[cfg_attr(
|
||||
all(test, not(target_os = "windows")),
|
||||
assert_instr(vextracti64x4, IMM8 = 1)
|
||||
assert_instr(vextracti64x4, IMM1 = 1)
|
||||
)]
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
pub unsafe fn _mm512_maskz_extracti64x4_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m256i {
|
||||
static_assert_imm1!(IMM8);
|
||||
let r = _mm512_extracti64x4_epi64(a, IMM8);
|
||||
pub unsafe fn _mm512_maskz_extracti64x4_epi64<const IMM1: i32>(k: __mmask8, a: __m512i) -> __m256i {
|
||||
static_assert_imm1!(IMM1);
|
||||
let r = _mm512_extracti64x4_epi64::<IMM1>(a);
|
||||
let zero = _mm256_setzero_si256().as_i64x4();
|
||||
transmute(simd_select_bitmask(k, r.as_i64x4(), zero))
|
||||
}
|
||||
|
|
@@ -22006,21 +22006,21 @@ pub unsafe fn _mm512_maskz_extractf64x4_pd<const IMM8: i32>(k: __mmask8, a: __m5
|
|||
transmute(simd_select_bitmask(k, r.as_f64x4(), zero))
|
||||
}
|
||||
|
||||
/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the result in dst.
|
||||
/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the result in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extracti32x4_epi32&expand=2461)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f")]
|
||||
#[cfg_attr(
|
||||
all(test, not(target_os = "windows")),
|
||||
assert_instr(vextractf32x4, imm8 = 3) //should be vextracti32x4
|
||||
assert_instr(vextractf32x4, IMM2 = 3) //should be vextracti32x4
|
||||
)]
|
||||
#[rustc_args_required_const(1)]
|
||||
pub unsafe fn _mm512_extracti32x4_epi32(a: __m512i, imm8: i32) -> __m128i {
|
||||
assert!(imm8 >= 0 && imm8 <= 3);
|
||||
#[rustc_legacy_const_generics(1)]
|
||||
pub unsafe fn _mm512_extracti32x4_epi32<const IMM2: i32>(a: __m512i) -> __m128i {
|
||||
static_assert_imm2!(IMM2);
|
||||
let a = a.as_i32x16();
|
||||
let undefined = _mm512_undefined_epi32().as_i32x16();
|
||||
let extract: i32x4 = match imm8 & 0x3 {
|
||||
let extract: i32x4 = match IMM2 {
|
||||
0 => simd_shuffle4(a, undefined, [0, 1, 2, 3]),
|
||||
1 => simd_shuffle4(a, undefined, [4, 5, 6, 7]),
|
||||
2 => simd_shuffle4(a, undefined, [8, 9, 10, 11]),
|
||||
|
|
@@ -22029,97 +22029,97 @@ pub unsafe fn _mm512_extracti32x4_epi32(a: __m512i, imm8: i32) -> __m128i {
|
|||
transmute(extract)
|
||||
}
|
||||
|
||||
/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extracti32x4_epi32&expand=2462)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f")]
|
||||
#[cfg_attr(
|
||||
all(test, not(target_os = "windows")),
|
||||
assert_instr(vextracti32x4, IMM8 = 3)
|
||||
assert_instr(vextracti32x4, IMM2 = 3)
|
||||
)]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn _mm512_mask_extracti32x4_epi32<const IMM8: i32>(
|
||||
pub unsafe fn _mm512_mask_extracti32x4_epi32<const IMM2: i32>(
|
||||
src: __m128i,
|
||||
k: __mmask8,
|
||||
a: __m512i,
|
||||
) -> __m128i {
|
||||
static_assert_imm2!(IMM8);
|
||||
let r = _mm512_extracti32x4_epi32(a, IMM8);
|
||||
static_assert_imm2!(IMM2);
|
||||
let r = _mm512_extracti32x4_epi32::<IMM2>(a);
|
||||
transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
|
||||
/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extracti32x4_epi32&expand=2463)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f")]
|
||||
#[cfg_attr(
|
||||
all(test, not(target_os = "windows")),
|
||||
assert_instr(vextracti32x4, IMM8 = 3)
|
||||
assert_instr(vextracti32x4, IMM2 = 3)
|
||||
)]
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
pub unsafe fn _mm512_maskz_extracti32x4_epi32<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m128i {
|
||||
static_assert_imm2!(IMM8);
|
||||
let r = _mm512_extracti32x4_epi32(a, IMM8);
|
||||
pub unsafe fn _mm512_maskz_extracti32x4_epi32<const IMM2: i32>(k: __mmask8, a: __m512i) -> __m128i {
|
||||
static_assert_imm2!(IMM2);
|
||||
let r = _mm512_extracti32x4_epi32::<IMM2>(a);
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
transmute(simd_select_bitmask(k, r.as_i32x4(), zero))
|
||||
}
|
||||
|
||||
/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the result in dst.
|
||||
/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the result in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extracti32x4_epi32&expand=2458)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512vl")]
|
||||
#[cfg_attr(
|
||||
all(test, not(target_os = "windows")),
|
||||
assert_instr(vextract, imm8 = 1) //should be vextracti32x4
|
||||
assert_instr(vextract, IMM1 = 1) //should be vextracti32x4
|
||||
)]
|
||||
#[rustc_args_required_const(1)]
|
||||
pub unsafe fn _mm256_extracti32x4_epi32(a: __m256i, imm8: i32) -> __m128i {
|
||||
assert!(imm8 >= 0 && imm8 <= 1);
|
||||
#[rustc_legacy_const_generics(1)]
|
||||
pub unsafe fn _mm256_extracti32x4_epi32<const IMM1: i32>(a: __m256i) -> __m128i {
|
||||
static_assert_imm1!(IMM1);
|
||||
let a = a.as_i32x8();
|
||||
let undefined = _mm256_undefined_si256().as_i32x8();
|
||||
let extract: i32x4 = match imm8 & 0x1 {
|
||||
let extract: i32x4 = match IMM1 {
|
||||
0 => simd_shuffle4(a, undefined, [0, 1, 2, 3]),
|
||||
_ => simd_shuffle4(a, undefined, [4, 5, 6, 7]),
|
||||
};
|
||||
transmute(extract)
|
||||
}
|
||||
|
||||
/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_extracti32x4_epi32&expand=2459)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512vl")]
|
||||
#[cfg_attr(
|
||||
all(test, not(target_os = "windows")),
|
||||
assert_instr(vextracti32x4, IMM8 = 1)
|
||||
assert_instr(vextracti32x4, IMM1 = 1)
|
||||
)]
|
||||
#[rustc_legacy_const_generics(3)]
|
||||
pub unsafe fn _mm256_mask_extracti32x4_epi32<const IMM8: i32>(
|
||||
pub unsafe fn _mm256_mask_extracti32x4_epi32<const IMM1: i32>(
|
||||
src: __m128i,
|
||||
k: __mmask8,
|
||||
a: __m256i,
|
||||
) -> __m128i {
|
||||
static_assert_imm1!(IMM8);
|
||||
let r = _mm256_extracti32x4_epi32(a, IMM8);
|
||||
static_assert_imm1!(IMM1);
|
||||
let r = _mm256_extracti32x4_epi32::<IMM1>(a);
|
||||
transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
|
||||
/// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_extracti32x4_epi32&expand=2460)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512vl")]
|
||||
#[cfg_attr(
|
||||
all(test, not(target_os = "windows")),
|
||||
assert_instr(vextracti32x4, IMM8 = 1)
|
||||
assert_instr(vextracti32x4, IMM1 = 1)
|
||||
)]
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
pub unsafe fn _mm256_maskz_extracti32x4_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m128i {
|
||||
static_assert_imm1!(IMM8);
|
||||
let r = _mm256_extracti32x4_epi32(a, IMM8);
|
||||
pub unsafe fn _mm256_maskz_extracti32x4_epi32<const IMM1: i32>(k: __mmask8, a: __m256i) -> __m128i {
|
||||
static_assert_imm1!(IMM1);
|
||||
let r = _mm256_extracti32x4_epi32::<IMM1>(a);
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
transmute(simd_select_bitmask(k, r.as_i32x4(), zero))
|
||||
}
|
||||
|
|
@@ -46698,7 +46698,7 @@ mod tests {
|
|||
#[simd_test(enable = "avx512f")]
|
||||
unsafe fn test_mm512_extracti32x4_epi32() {
|
||||
let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let r = _mm512_extracti32x4_epi32(a, 0b1);
|
||||
let r = _mm512_extracti32x4_epi32::<1>(a);
|
||||
let e = _mm_setr_epi32(5, 6, 7, 8);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
|
@@ -46727,7 +46727,7 @@ mod tests {
|
|||
#[simd_test(enable = "avx512f,avx512vl")]
|
||||
unsafe fn test_mm256_extracti32x4_epi32() {
|
||||
let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let r = _mm256_extracti32x4_epi32(a, 0b1);
|
||||
let r = _mm256_extracti32x4_epi32::<1>(a);
|
||||
let e = _mm_set_epi32(1, 2, 3, 4);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -262,24 +262,24 @@ mod tests {
|
|||
0x19BE9F660038BDB5,
|
||||
);
|
||||
let mut a_decomp = [_mm_setzero_si128(); 4];
|
||||
a_decomp[0] = _mm512_extracti32x4_epi32(a, 0);
|
||||
a_decomp[1] = _mm512_extracti32x4_epi32(a, 1);
|
||||
a_decomp[2] = _mm512_extracti32x4_epi32(a, 2);
|
||||
a_decomp[3] = _mm512_extracti32x4_epi32(a, 3);
|
||||
a_decomp[0] = _mm512_extracti32x4_epi32::<0>(a);
|
||||
a_decomp[1] = _mm512_extracti32x4_epi32::<1>(a);
|
||||
a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a);
|
||||
a_decomp[3] = _mm512_extracti32x4_epi32::<3>(a);
|
||||
let mut k_decomp = [_mm_setzero_si128(); 4];
|
||||
k_decomp[0] = _mm512_extracti32x4_epi32(k, 0);
|
||||
k_decomp[1] = _mm512_extracti32x4_epi32(k, 1);
|
||||
k_decomp[2] = _mm512_extracti32x4_epi32(k, 2);
|
||||
k_decomp[3] = _mm512_extracti32x4_epi32(k, 3);
|
||||
k_decomp[0] = _mm512_extracti32x4_epi32::<0>(k);
|
||||
k_decomp[1] = _mm512_extracti32x4_epi32::<1>(k);
|
||||
k_decomp[2] = _mm512_extracti32x4_epi32::<2>(k);
|
||||
k_decomp[3] = _mm512_extracti32x4_epi32::<3>(k);
|
||||
let r = vectorized(a, k);
|
||||
let mut e_decomp = [_mm_setzero_si128(); 4];
|
||||
for i in 0..4 {
|
||||
e_decomp[i] = linear(a_decomp[i], k_decomp[i]);
|
||||
}
|
||||
assert_eq_m128i(_mm512_extracti32x4_epi32(r, 0), e_decomp[0]);
|
||||
assert_eq_m128i(_mm512_extracti32x4_epi32(r, 1), e_decomp[1]);
|
||||
assert_eq_m128i(_mm512_extracti32x4_epi32(r, 2), e_decomp[2]);
|
||||
assert_eq_m128i(_mm512_extracti32x4_epi32(r, 3), e_decomp[3]);
|
||||
assert_eq_m128i(_mm512_extracti32x4_epi32::<0>(r), e_decomp[0]);
|
||||
assert_eq_m128i(_mm512_extracti32x4_epi32::<1>(r), e_decomp[1]);
|
||||
assert_eq_m128i(_mm512_extracti32x4_epi32::<2>(r), e_decomp[2]);
|
||||
assert_eq_m128i(_mm512_extracti32x4_epi32::<3>(r), e_decomp[3]);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512vaes,avx512f")]
|
||||
|
|
|
|||
|
|
@@ -34,15 +34,11 @@ extern "C" {
|
|||
#[inline]
|
||||
#[target_feature(enable = "avx512vpclmulqdq,avx512f")]
|
||||
// technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise
|
||||
#[cfg_attr(test, assert_instr(vpclmul, imm8 = 0))]
|
||||
#[rustc_args_required_const(2)]
|
||||
pub unsafe fn _mm512_clmulepi64_epi128(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => {
|
||||
pclmulqdq_512(a, b, $imm8)
|
||||
};
|
||||
}
|
||||
constify_imm8!(imm8, call)
|
||||
#[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
pub unsafe fn _mm512_clmulepi64_epi128<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
|
||||
static_assert_imm8!(IMM8);
|
||||
pclmulqdq_512(a, b, IMM8 as u8)
|
||||
}
|
||||
|
||||
/// Performs a carry-less multiplication of two 64-bit polynomials over the
|
||||
|
|
@@ -55,15 +51,11 @@ pub unsafe fn _mm512_clmulepi64_epi128(a: __m512i, b: __m512i, imm8: i32) -> __m
|
|||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_clmulepi64_epi128)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vpclmul, imm8 = 0))]
|
||||
#[rustc_args_required_const(2)]
|
||||
pub unsafe fn _mm256_clmulepi64_epi128(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => {
|
||||
pclmulqdq_256(a, b, $imm8)
|
||||
};
|
||||
}
|
||||
constify_imm8!(imm8, call)
|
||||
#[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
pub unsafe fn _mm256_clmulepi64_epi128<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
|
||||
static_assert_imm8!(IMM8);
|
||||
pclmulqdq_256(a, b, IMM8 as u8)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
@@ -93,37 +85,33 @@ mod tests {
|
|||
let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
|
||||
let r11 = $broadcast(r11);
|
||||
|
||||
$assert($clmul(a, b, 0x00), r00);
|
||||
$assert($clmul(a, b, 0x10), r01);
|
||||
$assert($clmul(a, b, 0x01), r10);
|
||||
$assert($clmul(a, b, 0x11), r11);
|
||||
$assert($clmul::<0x00>(a, b), r00);
|
||||
$assert($clmul::<0x10>(a, b), r01);
|
||||
$assert($clmul::<0x01>(a, b), r10);
|
||||
$assert($clmul::<0x11>(a, b), r11);
|
||||
|
||||
let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
|
||||
let a0 = $broadcast(a0);
|
||||
let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
|
||||
let r = $broadcast(r);
|
||||
$assert($clmul(a0, a0, 0x00), r);
|
||||
$assert($clmul::<0x00>(a0, a0), r);
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! unroll {
|
||||
($target:ident[4] = $op:ident($source:ident,4);) => {
|
||||
$target[3] = $op($source, 3);
|
||||
$target[2] = $op($source, 2);
|
||||
unroll! {$target[2] = $op($source,2);}
|
||||
($target:ident[4] = $op:ident::<4>($source:ident);) => {
|
||||
$target[3] = $op::<3>($source);
|
||||
$target[2] = $op::<2>($source);
|
||||
unroll! {$target[2] = $op::<2>($source);}
|
||||
};
|
||||
($target:ident[2] = $op:ident($source:ident,2);) => {
|
||||
$target[1] = $op($source, 1);
|
||||
$target[0] = $op($source, 0);
|
||||
($target:ident[2] = $op:ident::<2>($source:ident);) => {
|
||||
$target[1] = $op::<1>($source);
|
||||
$target[0] = $op::<0>($source);
|
||||
};
|
||||
(assert_eq_m128i($op:ident($vec_res:ident,4),$lin_res:ident[4]);) => {
|
||||
assert_eq_m128i($op($vec_res, 3), $lin_res[3]);
|
||||
assert_eq_m128i($op($vec_res, 2), $lin_res[2]);
|
||||
unroll! {assert_eq_m128i($op($vec_res,2),$lin_res[2]);}
|
||||
};
|
||||
(assert_eq_m128i($op:ident($vec_res:ident,2),$lin_res:ident[2]);) => {
|
||||
assert_eq_m128i($op($vec_res, 1), $lin_res[1]);
|
||||
assert_eq_m128i($op($vec_res, 0), $lin_res[0]);
|
||||
(assert_eq_m128i($op:ident::<4>($vec_res:ident),$lin_res:ident[4]);) => {
|
||||
assert_eq_m128i($op::<3>($vec_res), $lin_res[3]);
|
||||
assert_eq_m128i($op::<2>($vec_res), $lin_res[2]);
|
||||
unroll! {assert_eq_m128i($op::<2>($vec_res),$lin_res[2]);}
|
||||
};
|
||||
(assert_eq_m128i($op:ident::<2>($vec_res:ident),$lin_res:ident[2]);) => {
|
||||
assert_eq_m128i($op::<1>($vec_res), $lin_res[1]);
|
||||
|
|
@@ -160,16 +148,16 @@ mod tests {
|
|||
);
|
||||
|
||||
let mut a_decomp = [_mm_setzero_si128(); 4];
|
||||
unroll! {a_decomp[4] = _mm512_extracti32x4_epi32(a,4);}
|
||||
unroll! {a_decomp[4] = _mm512_extracti32x4_epi32::<4>(a);}
|
||||
let mut b_decomp = [_mm_setzero_si128(); 4];
|
||||
unroll! {b_decomp[4] = _mm512_extracti32x4_epi32(b,4);}
|
||||
unroll! {b_decomp[4] = _mm512_extracti32x4_epi32::<4>(b);}
|
||||
|
||||
let r = vectorized(a, b);
|
||||
let mut e_decomp = [_mm_setzero_si128(); 4];
|
||||
for i in 0..4 {
|
||||
e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
|
||||
}
|
||||
unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32(r,4),e_decomp[4]);}
|
||||
unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32::<4>(r),e_decomp[4]);}
|
||||
}
|
||||
|
||||
// this function tests one of the possible 4 instances
|
||||
|
|
@@ -201,13 +189,13 @@ mod tests {
|
|||
);
|
||||
|
||||
let mut a_decomp = [_mm_setzero_si128(); 2];
|
||||
unroll! {a_decomp[2] = _mm512_extracti32x4_epi32(a,2);}
|
||||
unroll! {a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a);}
|
||||
let mut b_decomp = [_mm_setzero_si128(); 2];
|
||||
unroll! {b_decomp[2] = _mm512_extracti32x4_epi32(b,2);}
|
||||
unroll! {b_decomp[2] = _mm512_extracti32x4_epi32::<2>(b);}
|
||||
|
||||
let r = vectorized(
|
||||
_mm512_extracti64x4_epi64(a, 0),
|
||||
_mm512_extracti64x4_epi64(b, 0),
|
||||
_mm512_extracti64x4_epi64::<0>(a),
|
||||
_mm512_extracti64x4_epi64::<0>(b),
|
||||
);
|
||||
let mut e_decomp = [_mm_setzero_si128(); 2];
|
||||
for i in 0..2 {
|
||||
|
|
@@ -226,19 +214,19 @@ mod tests {
|
|||
|
||||
verify_512_helper(
|
||||
|a, b| _mm_clmulepi64_si128::<0x00>(a, b),
|
||||
|a, b| _mm512_clmulepi64_epi128(a, b, 0x00),
|
||||
|a, b| _mm512_clmulepi64_epi128::<0x00>(a, b),
|
||||
);
|
||||
verify_512_helper(
|
||||
|a, b| _mm_clmulepi64_si128::<0x01>(a, b),
|
||||
|a, b| _mm512_clmulepi64_epi128(a, b, 0x01),
|
||||
|a, b| _mm512_clmulepi64_epi128::<0x01>(a, b),
|
||||
);
|
||||
verify_512_helper(
|
||||
|a, b| _mm_clmulepi64_si128::<0x10>(a, b),
|
||||
|a, b| _mm512_clmulepi64_epi128(a, b, 0x10),
|
||||
|a, b| _mm512_clmulepi64_epi128::<0x10>(a, b),
|
||||
);
|
||||
verify_512_helper(
|
||||
|a, b| _mm_clmulepi64_si128::<0x11>(a, b),
|
||||
|a, b| _mm512_clmulepi64_epi128(a, b, 0x11),
|
||||
|a, b| _mm512_clmulepi64_epi128::<0x11>(a, b),
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@@ -252,19 +240,19 @@ mod tests {
|
|||
|
||||
verify_256_helper(
|
||||
|a, b| _mm_clmulepi64_si128::<0x00>(a, b),
|
||||
|a, b| _mm256_clmulepi64_epi128(a, b, 0x00),
|
||||
|a, b| _mm256_clmulepi64_epi128::<0x00>(a, b),
|
||||
);
|
||||
verify_256_helper(
|
||||
|a, b| _mm_clmulepi64_si128::<0x01>(a, b),
|
||||
|a, b| _mm256_clmulepi64_epi128(a, b, 0x01),
|
||||
|a, b| _mm256_clmulepi64_epi128::<0x01>(a, b),
|
||||
);
|
||||
verify_256_helper(
|
||||
|a, b| _mm_clmulepi64_si128::<0x10>(a, b),
|
||||
|a, b| _mm256_clmulepi64_epi128(a, b, 0x10),
|
||||
|a, b| _mm256_clmulepi64_epi128::<0x10>(a, b),
|
||||
);
|
||||
verify_256_helper(
|
||||
|a, b| _mm_clmulepi64_si128::<0x11>(a, b),
|
||||
|a, b| _mm256_clmulepi64_epi128(a, b, 0x11),
|
||||
|a, b| _mm256_clmulepi64_epi128::<0x11>(a, b),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -11182,7 +11182,7 @@ mod tests {
|
|||
#[simd_test(enable = "avx512f")]
|
||||
unsafe fn test_mm512_extracti64x4_epi64() {
|
||||
let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let r = _mm512_extracti64x4_epi64(a, 0x1);
|
||||
let r = _mm512_extracti64x4_epi64::<0x1>(a);
|
||||
let e = _mm256_setr_epi64x(5, 6, 7, 8);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue