From d31615425590f168ae6d821215b207d08a02cc36 Mon Sep 17 00:00:00 2001
From: sayantn
Date: Sun, 9 Jun 2024 01:00:01 +0530
Subject: [PATCH] AVX512DQ Part 1: Logical Operations (and, andn, or, xor)

---
 library/stdarch/crates/core_arch/avx512dq.md |  73 ++-
 .../crates/core_arch/src/x86/avx512dq.rs     | 620 ++++++++++++++++++
 2 files changed, 656 insertions(+), 37 deletions(-)
 create mode 100644 library/stdarch/crates/core_arch/src/x86/avx512dq.rs

diff --git a/library/stdarch/crates/core_arch/avx512dq.md b/library/stdarch/crates/core_arch/avx512dq.md
index 00969026d039..a51a3d833301 100644
--- a/library/stdarch/crates/core_arch/avx512dq.md
+++ b/library/stdarch/crates/core_arch/avx512dq.md
@@ -402,41 +402,40 @@
 
 - Range
 
- * [ ] _mm512_reduce_round_pd
- * [ ] _mm512_mask_reduce_round_pd
- * [ ] _mm512_maskz_reduce_round_pd
- * [ ] _mm_reduce_pd
- * [ ] _mm_mask_reduce_pd
- * [ ] _mm_maskz_reduce_pd
- * [ ] _mm256_reduce_pd
- * [ ] _mm256_mask_reduce_pd
- * [ ] _mm256_maskz_reduce_pd
- * [ ] _mm512_reduce_pd
- * [ ] _mm512_mask_reduce_pd
- * [ ] _mm512_maskz_reduce_pd
- * [ ] _mm512_reduce_round_ps
- * [ ] _mm512_mask_reduce_round_ps
- * [ ] _mm512_maskz_reduce_round_ps
- * [ ] _mm_reduce_ps
- * [ ] _mm_mask_reduce_ps
- * [ ] _mm_maskz_reduce_ps
- * [ ] _mm256_reduce_ps
- * [ ] _mm256_mask_reduce_ps
- * [ ] _mm256_maskz_reduce_ps
- * [ ] _mm512_reduce_ps
- * [ ] _mm512_mask_reduce_ps
- * [ ] _mm512_maskz_reduce_ps
- * [ ] _mm_reduce_round_sd
- * [ ] _mm_mask_reduce_round_sd
- * [ ] _mm_maskz_reduce_round_sd
- * [ ] _mm_reduce_sd
- * [ ] _mm_mask_reduce_sd
- * [ ] _mm_maskz_reduce_sd
- * [ ] _mm_reduce_round_ss
- * [ ] _mm_mask_reduce_round_ss
- * [ ] _mm_maskz_reduce_round_ss
- * [ ] _mm_reduce_ss
- * [ ] _mm_mask_reduce_ss
- * [ ] _mm_maskz_reduce_ss
-
+ * [ ] _mm512_reduce_round_pd
+ * [ ] _mm512_mask_reduce_round_pd
+ * [ ] _mm512_maskz_reduce_round_pd
+ * [ ] _mm_reduce_pd
+ * [ ] _mm_mask_reduce_pd
+ * [ ] _mm_maskz_reduce_pd
+ * [ ] _mm256_reduce_pd
+ * [ ] _mm256_mask_reduce_pd
+ * [ ] _mm256_maskz_reduce_pd
+ * [ ] _mm512_reduce_pd
+ * [ ] _mm512_mask_reduce_pd
+ * [ ] _mm512_maskz_reduce_pd
+ * [ ] _mm512_reduce_round_ps
+ * [ ] _mm512_mask_reduce_round_ps
+ * [ ] _mm512_maskz_reduce_round_ps
+ * [ ] _mm_reduce_ps
+ * [ ] _mm_mask_reduce_ps
+ * [ ] _mm_maskz_reduce_ps
+ * [ ] _mm256_reduce_ps
+ * [ ] _mm256_mask_reduce_ps
+ * [ ] _mm256_maskz_reduce_ps
+ * [ ] _mm512_reduce_ps
+ * [ ] _mm512_mask_reduce_ps
+ * [ ] _mm512_maskz_reduce_ps
+ * [ ] _mm_reduce_round_sd
+ * [ ] _mm_mask_reduce_round_sd
+ * [ ] _mm_maskz_reduce_round_sd
+ * [ ] _mm_reduce_sd
+ * [ ] _mm_mask_reduce_sd
+ * [ ] _mm_maskz_reduce_sd
+ * [ ] _mm_reduce_round_ss
+ * [ ] _mm_mask_reduce_round_ss
+ * [ ] _mm_maskz_reduce_round_ss
+ * [ ] _mm_reduce_ss
+ * [ ] _mm_mask_reduce_ss
+ * [ ] _mm_maskz_reduce_ss
 

diff --git a/library/stdarch/crates/core_arch/src/x86/avx512dq.rs b/library/stdarch/crates/core_arch/src/x86/avx512dq.rs
new file mode 100644
index 000000000000..ba481ad42b11
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx512dq.rs
@@ -0,0 +1,620 @@
+use crate::{
+    core_arch::{simd::*, x86::*},
+    intrinsics::simd::*,
+    mem::transmute,
+};
+
+// And
+
+/// Computes the bitwise AND of packed double-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandps))]
+// FIXME: should be `vandpd` instruction.
+pub unsafe fn _mm_mask_and_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    let and = _mm_and_pd(a, b).as_f64x2();
+    transmute(simd_select_bitmask(k, and, src.as_f64x2()))
+}
+
+/// Computes the bitwise AND of packed double-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandps))]
+// FIXME: should be `vandpd` instruction.
+pub unsafe fn _mm_maskz_and_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    let and = _mm_and_pd(a, b).as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
+    transmute(simd_select_bitmask(k, and, zero))
+}
+
+/// Computes the bitwise AND of packed double-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandps))]
+// FIXME: should be `vandpd` instruction.
+pub unsafe fn _mm256_mask_and_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+    let and = _mm256_and_pd(a, b).as_f64x4();
+    transmute(simd_select_bitmask(k, and, src.as_f64x4()))
+}
+
+/// Computes the bitwise AND of packed double-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandps))]
+// FIXME: should be `vandpd` instruction.
+pub unsafe fn _mm256_maskz_and_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+    let and = _mm256_and_pd(a, b).as_f64x4();
+    let zero = _mm256_setzero_pd().as_f64x4();
+    transmute(simd_select_bitmask(k, and, zero))
+}
+
+/// Computes the bitwise AND of packed double-precision floating-point elements in `a` and `b`.
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vandps))]
+// FIXME: should be `vandpd` instruction.
+pub unsafe fn _mm512_and_pd(a: __m512d, b: __m512d) -> __m512d {
+    transmute(simd_and(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b)))
+}
+
+/// Computes the bitwise AND of packed double-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vandps))]
+// FIXME: should be `vandpd` instruction.
+pub unsafe fn _mm512_mask_and_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    let and = _mm512_and_pd(a, b).as_f64x8();
+    transmute(simd_select_bitmask(k, and, src.as_f64x8()))
+}
+
+/// Computes the bitwise AND of packed double-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vandps))]
+// FIXME: should be `vandpd` instruction.
+pub unsafe fn _mm512_maskz_and_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    let and = _mm512_and_pd(a, b).as_f64x8();
+    let zero = _mm512_setzero_pd().as_f64x8();
+    transmute(simd_select_bitmask(k, and, zero))
+}
+
+/// Computes the bitwise AND of packed single-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandps))]
+pub unsafe fn _mm_mask_and_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    let and = _mm_and_ps(a, b).as_f32x4();
+    transmute(simd_select_bitmask(k, and, src.as_f32x4()))
+}
+
+/// Computes the bitwise AND of packed single-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandps))]
+pub unsafe fn _mm_maskz_and_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    let and = _mm_and_ps(a, b).as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
+    transmute(simd_select_bitmask(k, and, zero))
+}
+
+/// Computes the bitwise AND of packed single-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandps))]
+pub unsafe fn _mm256_mask_and_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
+    let and = _mm256_and_ps(a, b).as_f32x8();
+    transmute(simd_select_bitmask(k, and, src.as_f32x8()))
+}
+
+/// Computes the bitwise AND of packed single-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandps))]
+pub unsafe fn _mm256_maskz_and_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+    let and = _mm256_and_ps(a, b).as_f32x8();
+    let zero = _mm256_setzero_ps().as_f32x8();
+    transmute(simd_select_bitmask(k, and, zero))
+}
+
+/// Computes the bitwise AND of packed single-precision floating-point elements in `a` and `b`.
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vandps))]
+pub unsafe fn _mm512_and_ps(a: __m512, b: __m512) -> __m512 {
+    transmute(simd_and(
+        transmute::<_, u32x16>(a),
+        transmute::<_, u32x16>(b),
+    ))
+}
+
+/// Computes the bitwise AND of packed single-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vandps))]
+pub unsafe fn _mm512_mask_and_ps(src: __m512, k: __mmask8, a: __m512, b: __m512) -> __m512 {
+    let and = _mm512_and_ps(a, b).as_f32x16();
+    transmute(simd_select_bitmask(k, and, src.as_f32x16()))
+}
+
+/// Computes the bitwise AND of packed single-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vandps))]
+pub unsafe fn _mm512_maskz_and_ps(k: __mmask8, a: __m512, b: __m512) -> __m512 {
+    let and = _mm512_and_ps(a, b).as_f32x16();
+    let zero = _mm512_setzero_ps().as_f32x16();
+    transmute(simd_select_bitmask(k, and, zero))
+}
+
+// Andnot
+
+/// Computes the bitwise NOT of packed double-precision floating-point elements in `a` and then ANDs
+/// with `b`, using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandnps))]
+// FIXME: should be `vandnpd` instruction.
+pub unsafe fn _mm_mask_andnot_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    let andnot = _mm_andnot_pd(a, b).as_f64x2();
+    transmute(simd_select_bitmask(k, andnot, src.as_f64x2()))
+}
+
+/// Computes the bitwise NOT of packed double-precision floating-point elements in `a` and then ANDs
+/// with `b`, using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandnps))]
+// FIXME: should be `vandnpd` instruction.
+pub unsafe fn _mm_maskz_andnot_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    let andnot = _mm_andnot_pd(a, b).as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
+    transmute(simd_select_bitmask(k, andnot, zero))
+}
+
+/// Computes the bitwise NOT of packed double-precision floating-point elements in `a` and then ANDs
+/// with `b`, using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandnps))]
+// FIXME: should be `vandnpd` instruction.
+pub unsafe fn _mm256_mask_andnot_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+    let andnot = _mm256_andnot_pd(a, b).as_f64x4();
+    transmute(simd_select_bitmask(k, andnot, src.as_f64x4()))
+}
+
+/// Computes the bitwise NOT of packed double-precision floating-point elements in `a` and then ANDs
+/// with `b`, using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandnps))]
+// FIXME: should be `vandnpd` instruction.
+pub unsafe fn _mm256_maskz_andnot_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+    let andnot = _mm256_andnot_pd(a, b).as_f64x4();
+    let zero = _mm256_setzero_pd().as_f64x4();
+    transmute(simd_select_bitmask(k, andnot, zero))
+}
+
+/// Computes the bitwise NOT of packed double-precision floating-point elements in `a` and then ANDs
+/// with `b`.
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vandnps))]
+// FIXME: should be `vandnpd` instruction.
+pub unsafe fn _mm512_andnot_pd(a: __m512d, b: __m512d) -> __m512d {
+    _mm512_and_pd(_mm512_xor_pd(a, transmute(_mm512_set1_epi64(-1))), b)
+}
+
+/// Computes the bitwise NOT of packed double-precision floating-point elements in `a` and then ANDs
+/// with `b`, using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vandnps))]
+// FIXME: should be `vandnpd` instruction.
+pub unsafe fn _mm512_mask_andnot_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    let andnot = _mm512_andnot_pd(a, b).as_f64x8();
+    transmute(simd_select_bitmask(k, andnot, src.as_f64x8()))
+}
+
+/// Computes the bitwise NOT of packed double-precision floating-point elements in `a` and then ANDs
+/// with `b`, using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vandnps))]
+// FIXME: should be `vandnpd` instruction.
+pub unsafe fn _mm512_maskz_andnot_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    let andnot = _mm512_andnot_pd(a, b).as_f64x8();
+    let zero = _mm512_setzero_pd().as_f64x8();
+    transmute(simd_select_bitmask(k, andnot, zero))
+}
+
+/// Computes the bitwise NOT of packed single-precision floating-point elements in `a` and then ANDs
+/// with `b`, using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandnps))]
+pub unsafe fn _mm_mask_andnot_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    let andnot = _mm_andnot_ps(a, b).as_f32x4();
+    transmute(simd_select_bitmask(k, andnot, src.as_f32x4()))
+}
+
+/// Computes the bitwise NOT of packed single-precision floating-point elements in `a` and then ANDs
+/// with `b`, using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandnps))]
+pub unsafe fn _mm_maskz_andnot_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    let andnot = _mm_andnot_ps(a, b).as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
+    transmute(simd_select_bitmask(k, andnot, zero))
+}
+
+/// Computes the bitwise NOT of packed single-precision floating-point elements in `a` and then ANDs
+/// with `b`, using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandnps))]
+pub unsafe fn _mm256_mask_andnot_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
+    let andnot = _mm256_andnot_ps(a, b).as_f32x8();
+    transmute(simd_select_bitmask(k, andnot, src.as_f32x8()))
+}
+
+/// Computes the bitwise NOT of packed single-precision floating-point elements in `a` and then ANDs
+/// with `b`, using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vandnps))]
+pub unsafe fn _mm256_maskz_andnot_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+    let andnot = _mm256_andnot_ps(a, b).as_f32x8();
+    let zero = _mm256_setzero_ps().as_f32x8();
+    transmute(simd_select_bitmask(k, andnot, zero))
+}
+
+/// Computes the bitwise NOT of packed single-precision floating-point elements in `a` and then ANDs
+/// with `b`.
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vandnps))]
+pub unsafe fn _mm512_andnot_ps(a: __m512, b: __m512) -> __m512 {
+    _mm512_and_ps(_mm512_xor_ps(a, transmute(_mm512_set1_epi32(-1))), b)
+}
+
+/// Computes the bitwise NOT of packed single-precision floating-point elements in `a` and then ANDs
+/// with `b`, using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vandnps))]
+pub unsafe fn _mm512_mask_andnot_ps(src: __m512, k: __mmask8, a: __m512, b: __m512) -> __m512 {
+    let andnot = _mm512_andnot_ps(a, b).as_f32x16();
+    transmute(simd_select_bitmask(k, andnot, src.as_f32x16()))
+}
+
+/// Computes the bitwise NOT of packed single-precision floating-point elements in `a` and then ANDs
+/// with `b`, using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vandnps))]
+pub unsafe fn _mm512_maskz_andnot_ps(k: __mmask8, a: __m512, b: __m512) -> __m512 {
+    let andnot = _mm512_andnot_ps(a, b).as_f32x16();
+    let zero = _mm512_setzero_ps().as_f32x16();
+    transmute(simd_select_bitmask(k, andnot, zero))
+}
+
+// Or
+
+/// Computes the bitwise OR of packed double-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vorps))]
+// FIXME: should be `vorpd` instruction.
+pub unsafe fn _mm_mask_or_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    let or = _mm_or_pd(a, b).as_f64x2();
+    transmute(simd_select_bitmask(k, or, src.as_f64x2()))
+}
+
+/// Computes the bitwise OR of packed double-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vorps))]
+// FIXME: should be `vorpd` instruction.
+pub unsafe fn _mm_maskz_or_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    let or = _mm_or_pd(a, b).as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
+    transmute(simd_select_bitmask(k, or, zero))
+}
+
+/// Computes the bitwise OR of packed double-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vorps))]
+// FIXME: should be `vorpd` instruction.
+pub unsafe fn _mm256_mask_or_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+    let or = _mm256_or_pd(a, b).as_f64x4();
+    transmute(simd_select_bitmask(k, or, src.as_f64x4()))
+}
+
+/// Computes the bitwise OR of packed double-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vorps))]
+// FIXME: should be `vorpd` instruction.
+pub unsafe fn _mm256_maskz_or_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+    let or = _mm256_or_pd(a, b).as_f64x4();
+    let zero = _mm256_setzero_pd().as_f64x4();
+    transmute(simd_select_bitmask(k, or, zero))
+}
+
+/// Computes the bitwise OR of packed double-precision floating-point elements in `a` and `b`.
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vorps))]
+// FIXME: should be `vorpd` instruction.
+pub unsafe fn _mm512_or_pd(a: __m512d, b: __m512d) -> __m512d {
+    transmute(simd_or(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b)))
+}
+
+/// Computes the bitwise OR of packed double-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vorps))]
+// FIXME: should be `vorpd` instruction.
+pub unsafe fn _mm512_mask_or_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    let or = _mm512_or_pd(a, b).as_f64x8();
+    transmute(simd_select_bitmask(k, or, src.as_f64x8()))
+}
+
+/// Computes the bitwise OR of packed double-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vorps))]
+// FIXME: should be `vorpd` instruction.
+pub unsafe fn _mm512_maskz_or_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    let or = _mm512_or_pd(a, b).as_f64x8();
+    let zero = _mm512_setzero_pd().as_f64x8();
+    transmute(simd_select_bitmask(k, or, zero))
+}
+
+/// Computes the bitwise OR of packed single-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vorps))]
+pub unsafe fn _mm_mask_or_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    let or = _mm_or_ps(a, b).as_f32x4();
+    transmute(simd_select_bitmask(k, or, src.as_f32x4()))
+}
+
+/// Computes the bitwise OR of packed single-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vorps))]
+pub unsafe fn _mm_maskz_or_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    let or = _mm_or_ps(a, b).as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
+    transmute(simd_select_bitmask(k, or, zero))
+}
+
+/// Computes the bitwise OR of packed single-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vorps))]
+pub unsafe fn _mm256_mask_or_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
+    let or = _mm256_or_ps(a, b).as_f32x8();
+    transmute(simd_select_bitmask(k, or, src.as_f32x8()))
+}
+
+/// Computes the bitwise OR of packed single-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vorps))]
+pub unsafe fn _mm256_maskz_or_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+    let or = _mm256_or_ps(a, b).as_f32x8();
+    let zero = _mm256_setzero_ps().as_f32x8();
+    transmute(simd_select_bitmask(k, or, zero))
+}
+
+/// Computes the bitwise OR of packed single-precision floating-point elements in `a` and `b`.
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vorps))]
+pub unsafe fn _mm512_or_ps(a: __m512, b: __m512) -> __m512 {
+    transmute(simd_or(
+        transmute::<_, u32x16>(a),
+        transmute::<_, u32x16>(b),
+    ))
+}
+
+/// Computes the bitwise OR of packed single-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vorps))]
+pub unsafe fn _mm512_mask_or_ps(src: __m512, k: __mmask8, a: __m512, b: __m512) -> __m512 {
+    let or = _mm512_or_ps(a, b).as_f32x16();
+    transmute(simd_select_bitmask(k, or, src.as_f32x16()))
+}
+
+/// Computes the bitwise OR of packed single-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vorps))]
+pub unsafe fn _mm512_maskz_or_ps(k: __mmask8, a: __m512, b: __m512) -> __m512 {
+    let or = _mm512_or_ps(a, b).as_f32x16();
+    let zero = _mm512_setzero_ps().as_f32x16();
+    transmute(simd_select_bitmask(k, or, zero))
+}
+
+// Xor
+
+/// Computes the bitwise XOR of packed double-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vxorps))]
+// FIXME: should be `vxorpd` instruction.
+pub unsafe fn _mm_mask_xor_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    let xor = _mm_xor_pd(a, b).as_f64x2();
+    transmute(simd_select_bitmask(k, xor, src.as_f64x2()))
+}
+
+/// Computes the bitwise XOR of packed double-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vxorps))]
+// FIXME: should be `vxorpd` instruction.
+pub unsafe fn _mm_maskz_xor_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    let xor = _mm_xor_pd(a, b).as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
+    transmute(simd_select_bitmask(k, xor, zero))
+}
+
+/// Computes the bitwise XOR of packed double-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vxorps))]
+// FIXME: should be `vxorpd` instruction.
+pub unsafe fn _mm256_mask_xor_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+    let xor = _mm256_xor_pd(a, b).as_f64x4();
+    transmute(simd_select_bitmask(k, xor, src.as_f64x4()))
+}
+
+/// Computes the bitwise XOR of packed double-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vxorps))]
+// FIXME: should be `vxorpd` instruction.
+pub unsafe fn _mm256_maskz_xor_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+    let xor = _mm256_xor_pd(a, b).as_f64x4();
+    let zero = _mm256_setzero_pd().as_f64x4();
+    transmute(simd_select_bitmask(k, xor, zero))
+}
+
+/// Computes the bitwise XOR of packed double-precision floating-point elements in `a` and `b`.
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vxorps))]
+// FIXME: should be `vxorpd` instruction.
+pub unsafe fn _mm512_xor_pd(a: __m512d, b: __m512d) -> __m512d {
+    transmute(simd_xor(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b)))
+}
+
+/// Computes the bitwise XOR of packed double-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vxorps))]
+// FIXME: should be `vxorpd` instruction.
+pub unsafe fn _mm512_mask_xor_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    let xor = _mm512_xor_pd(a, b).as_f64x8();
+    transmute(simd_select_bitmask(k, xor, src.as_f64x8()))
+}
+
+/// Computes the bitwise XOR of packed double-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vxorps))]
+// FIXME: should be `vxorpd` instruction.
+pub unsafe fn _mm512_maskz_xor_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    let xor = _mm512_xor_pd(a, b).as_f64x8();
+    let zero = _mm512_setzero_pd().as_f64x8();
+    transmute(simd_select_bitmask(k, xor, zero))
+}
+
+/// Computes the bitwise XOR of packed single-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm_mask_xor_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    let xor = _mm_xor_ps(a, b).as_f32x4();
+    transmute(simd_select_bitmask(k, xor, src.as_f32x4()))
+}
+
+/// Computes the bitwise XOR of packed single-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm_maskz_xor_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    let xor = _mm_xor_ps(a, b).as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
+    transmute(simd_select_bitmask(k, xor, zero))
+}
+
+/// Computes the bitwise XOR of packed single-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm256_mask_xor_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
+    let xor = _mm256_xor_ps(a, b).as_f32x8();
+    transmute(simd_select_bitmask(k, xor, src.as_f32x8()))
+}
+
+/// Computes the bitwise XOR of packed single-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq,avx512vl")]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm256_maskz_xor_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+    let xor = _mm256_xor_ps(a, b).as_f32x8();
+    let zero = _mm256_setzero_ps().as_f32x8();
+    transmute(simd_select_bitmask(k, xor, zero))
+}
+
+/// Computes the bitwise XOR of packed single-precision floating-point elements in `a` and `b`.
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm512_xor_ps(a: __m512, b: __m512) -> __m512 {
+    transmute(simd_xor(
+        transmute::<_, u32x16>(a),
+        transmute::<_, u32x16>(b),
+    ))
+}
+
+/// Computes the bitwise XOR of packed single-precision floating-point elements in `a` and `b`,
+/// using writemask `k` (elements are copied from `src` if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm512_mask_xor_ps(src: __m512, k: __mmask8, a: __m512, b: __m512) -> __m512 {
+    let xor = _mm512_xor_ps(a, b).as_f32x16();
+    transmute(simd_select_bitmask(k, xor, src.as_f32x16()))
+}
+
+/// Computes the bitwise XOR of packed single-precision floating-point elements in `a` and `b`,
+/// using zeromask `k` (elements are zeroed out if the corresponding bit is not set).
+#[inline]
+#[target_feature(enable = "avx512dq")]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm512_maskz_xor_ps(k: __mmask8, a: __m512, b: __m512) -> __m512 {
+    let xor = _mm512_xor_ps(a, b).as_f32x16();
+    let zero = _mm512_setzero_ps().as_f32x16();
+    transmute(simd_select_bitmask(k, xor, zero))
+}
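
Usage sketch (reviewer aside, not part of the patch): one minimal way to exercise the new intrinsics, assuming a nightly toolchain where these are gated behind the `stdarch_x86_avx512` feature (the gate name and the helper `abs_pd` are illustrative assumptions, not defined by this patch). ANDNOT against a `-0.0` mask clears each lane's sign bit, i.e. a lane-wise `abs`:

#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

// Hypothetical helper: lane-wise |x| for 8 f64 lanes, computed as
// (!(-0.0)) & x, which clears only the sign bit of each lane.
#[target_feature(enable = "avx512dq")]
unsafe fn abs_pd(x: __m512d) -> __m512d {
    _mm512_andnot_pd(_mm512_set1_pd(-0.0), x)
}

fn main() {
    // Dispatch at runtime so the binary stays safe on non-AVX512DQ CPUs.
    if is_x86_feature_detected!("avx512dq") {
        let v = unsafe { abs_pd(_mm512_set1_pd(-2.5)) };
        let mut out = [0.0f64; 8];
        unsafe { _mm512_storeu_pd(out.as_mut_ptr(), v) };
        assert_eq!(out, [2.5; 8]);
    }
}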