Add AVX512VL variants of IFMA instructions.

This commit is contained in:
Henry de Valence 2019-02-04 19:34:09 -08:00 committed by gnzlbg
parent 10e05055bf
commit ddd70f987f

View file

@ -31,6 +31,62 @@ pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m51
vpmadd52luq_512(a, b, c)
}
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3485)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
vpmadd52huq_256(a, b, c)
}
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3494)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
vpmadd52luq_256(a, b, c)
}
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3482&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
vpmadd52huq_128(a, b, c)
}
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3491&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
vpmadd52luq_128(a, b, c)
}
#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"]
@ -81,4 +137,60 @@ mod tests {
assert_eq_m512i(a, expected);
}
#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm256_madd52hi_epu64() {
let mut a = _mm256_set1_epi64x(10 << 40);
let b = _mm256_set1_epi64x((11 << 40) + 4);
let c = _mm256_set1_epi64x((12 << 40) + 3);
a = _mm256_madd52hi_epu64(a, b, c);
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
let expected = _mm256_set1_epi64x(11030549757952);
assert_eq_m256i(a, expected);
}
#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm256_madd52lo_epu64() {
let mut a = _mm256_set1_epi64x(10 << 40);
let b = _mm256_set1_epi64x((11 << 40) + 4);
let c = _mm256_set1_epi64x((12 << 40) + 3);
a = _mm256_madd52lo_epu64(a, b, c);
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
let expected = _mm256_set1_epi64x(100055558127628);
assert_eq_m256i(a, expected);
}
#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm_madd52hi_epu64() {
let mut a = _mm_set1_epi64x(10 << 40);
let b = _mm_set1_epi64x((11 << 40) + 4);
let c = _mm_set1_epi64x((12 << 40) + 3);
a = _mm_madd52hi_epu64(a, b, c);
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
let expected = _mm_set1_epi64x(11030549757952);
assert_eq_m128i(a, expected);
}
#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm_madd52lo_epu64() {
let mut a = _mm_set1_epi64x(10 << 40);
let b = _mm_set1_epi64x((11 << 40) + 4);
let c = _mm_set1_epi64x((12 << 40) + 3);
a = _mm_madd52hi_epu64(a, b, c);
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
let expected = _mm_set1_epi64x(11030549757952);
assert_eq_m128i(a, expected);
}
}