Define remaining IFMA intrinsics
This commit is contained in:
parent
a56cc86a23
commit
fcee4d8b16
2 changed files with 429 additions and 48 deletions
|
|
@ -219,28 +219,6 @@
|
|||
</p></details>
|
||||
|
||||
|
||||
<details><summary>["AVX512IFMA52"]</summary><p>
|
||||
|
||||
* [ ] [`_mm512_mask_madd52hi_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_madd52hi_epu64)
|
||||
* [ ] [`_mm512_mask_madd52lo_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_madd52lo_epu64)
|
||||
* [ ] [`_mm512_maskz_madd52hi_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_madd52hi_epu64)
|
||||
* [ ] [`_mm512_maskz_madd52lo_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_madd52lo_epu64)
|
||||
</p></details>
|
||||
|
||||
|
||||
<details><summary>["AVX512IFMA52", "AVX512VL"]</summary><p>
|
||||
|
||||
* [ ] [`_mm256_mask_madd52hi_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_madd52hi_epu64)
|
||||
* [ ] [`_mm256_mask_madd52lo_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_madd52lo_epu64)
|
||||
* [ ] [`_mm256_maskz_madd52hi_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_madd52hi_epu64)
|
||||
* [ ] [`_mm256_maskz_madd52lo_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_madd52lo_epu64)
|
||||
* [ ] [`_mm_mask_madd52hi_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_madd52hi_epu64)
|
||||
* [ ] [`_mm_mask_madd52lo_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_madd52lo_epu64)
|
||||
* [ ] [`_mm_maskz_madd52hi_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_madd52hi_epu64)
|
||||
* [ ] [`_mm_maskz_madd52lo_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_madd52lo_epu64)
|
||||
</p></details>
|
||||
|
||||
|
||||
<details><summary>["AVX512_BF16", "AVX512F"]</summary><p>
|
||||
|
||||
* [ ] [`_mm512_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps)
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
use crate::core_arch::x86::*;
|
||||
use crate::intrinsics::simd::simd_select_bitmask;
|
||||
|
||||
#[cfg(test)]
|
||||
use stdarch_test::assert_instr;
|
||||
|
|
@ -9,7 +10,7 @@ use stdarch_test::assert_instr;
|
|||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst`.
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512IFMA52&expand=3488)
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_madd52hi_epu64)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512ifma")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
|
|
@ -18,13 +19,53 @@ pub unsafe fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m51
|
|||
vpmadd52huq_512(a, b, c)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst` using writemask `k` (elements are copied
|
||||
/// from `k` when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_mask_madd52hi_epu64)
|
||||
#[target_feature(enable = "avx512ifma")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpmadd52huq))]
|
||||
pub unsafe fn _mm512_mask_madd52hi_epu64(
|
||||
a: __m512i,
|
||||
k: __mmask8,
|
||||
b: __m512i,
|
||||
c: __m512i,
|
||||
) -> __m512i {
|
||||
simd_select_bitmask(k, vpmadd52huq_512(a, b, c), a)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst` using writemask `k` (elements are zeroed
|
||||
/// out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_maskz_madd52hi_epu64)
|
||||
#[target_feature(enable = "avx512ifma")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpmadd52huq))]
|
||||
pub unsafe fn _mm512_maskz_madd52hi_epu64(
|
||||
k: __mmask8,
|
||||
a: __m512i,
|
||||
b: __m512i,
|
||||
c: __m512i,
|
||||
) -> __m512i {
|
||||
simd_select_bitmask(k, vpmadd52huq_512(a, b, c), _mm512_setzero_si512())
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst`.
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3497&avx512techs=AVX512IFMA52)
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_madd52lo_epu64)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512ifma")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
|
|
@ -33,13 +74,53 @@ pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m51
|
|||
vpmadd52luq_512(a, b, c)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst` using writemask `k` (elements are copied
|
||||
/// from `k` when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_mask_madd52lo_epu64)
|
||||
#[target_feature(enable = "avx512ifma")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpmadd52luq))]
|
||||
pub unsafe fn _mm512_mask_madd52lo_epu64(
|
||||
a: __m512i,
|
||||
k: __mmask8,
|
||||
b: __m512i,
|
||||
c: __m512i,
|
||||
) -> __m512i {
|
||||
simd_select_bitmask(k, vpmadd52luq_512(a, b, c), a)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst` using writemask `k` (elements are zeroed
|
||||
/// out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_maskz_madd52lo_epu64)
|
||||
#[target_feature(enable = "avx512ifma")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpmadd52luq))]
|
||||
pub unsafe fn _mm512_maskz_madd52lo_epu64(
|
||||
k: __mmask8,
|
||||
a: __m512i,
|
||||
b: __m512i,
|
||||
c: __m512i,
|
||||
) -> __m512i {
|
||||
simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512())
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst`.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3485)
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_madd52hi_epu64)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512ifma,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
|
|
@ -48,13 +129,53 @@ pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m25
|
|||
vpmadd52huq_256(a, b, c)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst` using writemask `k` (elements are copied
|
||||
/// from `k` when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_mask_madd52hi_epu64)
|
||||
#[target_feature(enable = "avx512ifma,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpmadd52huq))]
|
||||
pub unsafe fn _mm256_mask_madd52hi_epu64(
|
||||
a: __m256i,
|
||||
k: __mmask8,
|
||||
b: __m256i,
|
||||
c: __m256i,
|
||||
) -> __m256i {
|
||||
simd_select_bitmask(k, vpmadd52huq_256(a, b, c), a)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst` using writemask `k` (elements are zeroed
|
||||
/// out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_maskz_madd52hi_epu64)
|
||||
#[target_feature(enable = "avx512ifma,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpmadd52huq))]
|
||||
pub unsafe fn _mm256_maskz_madd52hi_epu64(
|
||||
k: __mmask8,
|
||||
a: __m256i,
|
||||
b: __m256i,
|
||||
c: __m256i,
|
||||
) -> __m256i {
|
||||
simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256())
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst`.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3494)
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_madd52lo_epu64)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512ifma,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
|
|
@ -63,13 +184,53 @@ pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m25
|
|||
vpmadd52luq_256(a, b, c)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst` using writemask `k` (elements are copied
|
||||
/// from `k` when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_mask_madd52lo_epu64)
|
||||
#[target_feature(enable = "avx512ifma,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpmadd52luq))]
|
||||
pub unsafe fn _mm256_mask_madd52lo_epu64(
|
||||
a: __m256i,
|
||||
k: __mmask8,
|
||||
b: __m256i,
|
||||
c: __m256i,
|
||||
) -> __m256i {
|
||||
simd_select_bitmask(k, vpmadd52luq_256(a, b, c), a)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst` using writemask `k` (elements are zeroed
|
||||
/// out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_maskz_madd52lo_epu64)
|
||||
#[target_feature(enable = "avx512ifma,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpmadd52luq))]
|
||||
pub unsafe fn _mm256_maskz_madd52lo_epu64(
|
||||
k: __mmask8,
|
||||
a: __m256i,
|
||||
b: __m256i,
|
||||
c: __m256i,
|
||||
) -> __m256i {
|
||||
simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256())
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst`.
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3482&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_madd52hi_epu64)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512ifma,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
|
|
@ -78,13 +239,43 @@ pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i
|
|||
vpmadd52huq_128(a, b, c)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst` using writemask `k` (elements are copied
|
||||
/// from `k` when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_mask_madd52hi_epu64)
|
||||
#[target_feature(enable = "avx512ifma,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpmadd52huq))]
|
||||
pub unsafe fn _mm_mask_madd52hi_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
|
||||
simd_select_bitmask(k, vpmadd52huq_128(a, b, c), a)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst` using writemask `k` (elements are zeroed
|
||||
/// out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_maskz_madd52hi_epu64)
|
||||
#[target_feature(enable = "avx512ifma,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpmadd52huq))]
|
||||
pub unsafe fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
|
||||
simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128())
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst`.
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3491&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_madd52lo_epu64)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512ifma,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
|
|
@ -93,6 +284,36 @@ pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i
|
|||
vpmadd52luq_128(a, b, c)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst` using writemask `k` (elements are copied
|
||||
/// from `k` when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_mask_madd52lo_epu64)
|
||||
#[target_feature(enable = "avx512ifma,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpmadd52luq))]
|
||||
pub unsafe fn _mm_mask_madd52lo_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
|
||||
simd_select_bitmask(k, vpmadd52luq_128(a, b, c), a)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst` using writemask `k` (elements are zeroed
|
||||
/// out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_maskz_madd52lo_epu64)
|
||||
#[target_feature(enable = "avx512ifma,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpmadd52luq))]
|
||||
pub unsafe fn _mm_maskz_madd52lo_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
|
||||
simd_select_bitmask(k, vpmadd52luq_128(a, b, c), _mm_setzero_si128())
|
||||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
extern "C" {
|
||||
#[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"]
|
||||
|
|
@ -116,87 +337,269 @@ mod tests {
|
|||
|
||||
use crate::core_arch::x86::*;
|
||||
|
||||
const K: __mmask8 = 0b01101101;
|
||||
|
||||
#[simd_test(enable = "avx512ifma")]
|
||||
unsafe fn test_mm512_madd52hi_epu64() {
|
||||
let mut a = _mm512_set1_epi64(10 << 40);
|
||||
let a = _mm512_set1_epi64(10 << 40);
|
||||
let b = _mm512_set1_epi64((11 << 40) + 4);
|
||||
let c = _mm512_set1_epi64((12 << 40) + 3);
|
||||
|
||||
a = _mm512_madd52hi_epu64(a, b, c);
|
||||
let actual = _mm512_madd52hi_epu64(a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
|
||||
let expected = _mm512_set1_epi64(11030549757952);
|
||||
|
||||
assert_eq_m512i(a, expected);
|
||||
assert_eq_m512i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma")]
|
||||
unsafe fn test_mm512_mask_madd52hi_epu64() {
|
||||
let a = _mm512_set1_epi64(10 << 40);
|
||||
let b = _mm512_set1_epi64((11 << 40) + 4);
|
||||
let c = _mm512_set1_epi64((12 << 40) + 3);
|
||||
|
||||
let actual = _mm512_mask_madd52hi_epu64(a, K, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
|
||||
let mut expected = _mm512_set1_epi64(11030549757952);
|
||||
expected = _mm512_mask_blend_epi64(K, a, expected);
|
||||
|
||||
assert_eq_m512i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma")]
|
||||
unsafe fn test_mm512_maskz_madd52hi_epu64() {
|
||||
let a = _mm512_set1_epi64(10 << 40);
|
||||
let b = _mm512_set1_epi64((11 << 40) + 4);
|
||||
let c = _mm512_set1_epi64((12 << 40) + 3);
|
||||
|
||||
let actual = _mm512_maskz_madd52hi_epu64(K, a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
|
||||
let mut expected = _mm512_set1_epi64(11030549757952);
|
||||
expected = _mm512_mask_blend_epi64(K, _mm512_setzero_si512(), expected);
|
||||
|
||||
assert_eq_m512i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma")]
|
||||
unsafe fn test_mm512_madd52lo_epu64() {
|
||||
let mut a = _mm512_set1_epi64(10 << 40);
|
||||
let a = _mm512_set1_epi64(10 << 40);
|
||||
let b = _mm512_set1_epi64((11 << 40) + 4);
|
||||
let c = _mm512_set1_epi64((12 << 40) + 3);
|
||||
|
||||
a = _mm512_madd52lo_epu64(a, b, c);
|
||||
let actual = _mm512_madd52lo_epu64(a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
|
||||
let expected = _mm512_set1_epi64(100055558127628);
|
||||
|
||||
assert_eq_m512i(a, expected);
|
||||
assert_eq_m512i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma")]
|
||||
unsafe fn test_mm512_mask_madd52lo_epu64() {
|
||||
let a = _mm512_set1_epi64(10 << 40);
|
||||
let b = _mm512_set1_epi64((11 << 40) + 4);
|
||||
let c = _mm512_set1_epi64((12 << 40) + 3);
|
||||
|
||||
let actual = _mm512_mask_madd52lo_epu64(a, K, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
|
||||
let mut expected = _mm512_set1_epi64(100055558127628);
|
||||
expected = _mm512_mask_blend_epi64(K, a, expected);
|
||||
|
||||
assert_eq_m512i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma")]
|
||||
unsafe fn test_mm512_maskz_madd52lo_epu64() {
|
||||
let a = _mm512_set1_epi64(10 << 40);
|
||||
let b = _mm512_set1_epi64((11 << 40) + 4);
|
||||
let c = _mm512_set1_epi64((12 << 40) + 3);
|
||||
|
||||
let actual = _mm512_maskz_madd52lo_epu64(K, a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
|
||||
let mut expected = _mm512_set1_epi64(100055558127628);
|
||||
expected = _mm512_mask_blend_epi64(K, _mm512_setzero_si512(), expected);
|
||||
|
||||
assert_eq_m512i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm256_madd52hi_epu64() {
|
||||
let mut a = _mm256_set1_epi64x(10 << 40);
|
||||
let a = _mm256_set1_epi64x(10 << 40);
|
||||
let b = _mm256_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm256_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
a = _mm256_madd52hi_epu64(a, b, c);
|
||||
let actual = _mm256_madd52hi_epu64(a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
|
||||
let expected = _mm256_set1_epi64x(11030549757952);
|
||||
|
||||
assert_eq_m256i(a, expected);
|
||||
assert_eq_m256i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm256_mask_madd52hi_epu64() {
|
||||
let a = _mm256_set1_epi64x(10 << 40);
|
||||
let b = _mm256_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm256_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
let actual = _mm256_mask_madd52hi_epu64(a, K, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
|
||||
let mut expected = _mm256_set1_epi64x(11030549757952);
|
||||
expected = _mm256_mask_blend_epi64(K, a, expected);
|
||||
|
||||
assert_eq_m256i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm256_maskz_madd52hi_epu64() {
|
||||
let a = _mm256_set1_epi64x(10 << 40);
|
||||
let b = _mm256_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm256_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
let actual = _mm256_maskz_madd52hi_epu64(K, a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
|
||||
let mut expected = _mm256_set1_epi64x(11030549757952);
|
||||
expected = _mm256_mask_blend_epi64(K, _mm256_setzero_si256(), expected);
|
||||
|
||||
assert_eq_m256i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm256_madd52lo_epu64() {
|
||||
let mut a = _mm256_set1_epi64x(10 << 40);
|
||||
let a = _mm256_set1_epi64x(10 << 40);
|
||||
let b = _mm256_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm256_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
a = _mm256_madd52lo_epu64(a, b, c);
|
||||
let actual = _mm256_madd52lo_epu64(a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
|
||||
let expected = _mm256_set1_epi64x(100055558127628);
|
||||
|
||||
assert_eq_m256i(a, expected);
|
||||
assert_eq_m256i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm256_mask_madd52lo_epu64() {
|
||||
let a = _mm256_set1_epi64x(10 << 40);
|
||||
let b = _mm256_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm256_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
let actual = _mm256_mask_madd52lo_epu64(a, K, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
|
||||
let mut expected = _mm256_set1_epi64x(100055558127628);
|
||||
expected = _mm256_mask_blend_epi64(K, a, expected);
|
||||
|
||||
assert_eq_m256i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm256_maskz_madd52lo_epu64() {
|
||||
let a = _mm256_set1_epi64x(10 << 40);
|
||||
let b = _mm256_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm256_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
let actual = _mm256_maskz_madd52lo_epu64(K, a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
|
||||
let mut expected = _mm256_set1_epi64x(100055558127628);
|
||||
expected = _mm256_mask_blend_epi64(K, _mm256_setzero_si256(), expected);
|
||||
|
||||
assert_eq_m256i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm_madd52hi_epu64() {
|
||||
let mut a = _mm_set1_epi64x(10 << 40);
|
||||
let a = _mm_set1_epi64x(10 << 40);
|
||||
let b = _mm_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
a = _mm_madd52hi_epu64(a, b, c);
|
||||
let actual = _mm_madd52hi_epu64(a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
|
||||
let expected = _mm_set1_epi64x(11030549757952);
|
||||
|
||||
assert_eq_m128i(a, expected);
|
||||
assert_eq_m128i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm_mask_madd52hi_epu64() {
|
||||
let a = _mm_set1_epi64x(10 << 40);
|
||||
let b = _mm_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
let actual = _mm_mask_madd52hi_epu64(a, K, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
|
||||
let mut expected = _mm_set1_epi64x(11030549757952);
|
||||
expected = _mm_mask_blend_epi64(K, a, expected);
|
||||
|
||||
assert_eq_m128i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm_maskz_madd52hi_epu64() {
|
||||
let a = _mm_set1_epi64x(10 << 40);
|
||||
let b = _mm_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
let actual = _mm_maskz_madd52hi_epu64(K, a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
|
||||
let mut expected = _mm_set1_epi64x(11030549757952);
|
||||
expected = _mm_mask_blend_epi64(K, _mm_setzero_si128(), expected);
|
||||
|
||||
assert_eq_m128i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm_madd52lo_epu64() {
|
||||
let mut a = _mm_set1_epi64x(10 << 40);
|
||||
let a = _mm_set1_epi64x(10 << 40);
|
||||
let b = _mm_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
a = _mm_madd52hi_epu64(a, b, c);
|
||||
let actual = _mm_madd52lo_epu64(a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
|
||||
let expected = _mm_set1_epi64x(11030549757952);
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
|
||||
let expected = _mm_set1_epi64x(100055558127628);
|
||||
|
||||
assert_eq_m128i(a, expected);
|
||||
assert_eq_m128i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm_mask_madd52lo_epu64() {
|
||||
let a = _mm_set1_epi64x(10 << 40);
|
||||
let b = _mm_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
let actual = _mm_mask_madd52lo_epu64(a, K, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
|
||||
let mut expected = _mm_set1_epi64x(100055558127628);
|
||||
expected = _mm_mask_blend_epi64(K, a, expected);
|
||||
|
||||
assert_eq_m128i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm_maskz_madd52lo_epu64() {
|
||||
let a = _mm_set1_epi64x(10 << 40);
|
||||
let b = _mm_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
let actual = _mm_maskz_madd52lo_epu64(K, a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
|
||||
let mut expected = _mm_set1_epi64x(100055558127628);
|
||||
expected = _mm_mask_blend_epi64(K, _mm_setzero_si128(), expected);
|
||||
|
||||
assert_eq_m128i(expected, actual);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue