From aa84427fd49f8123067fa3647bb3fb55a8403b20 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sun, 14 Jul 2024 23:19:26 +0530 Subject: [PATCH] Use LLVM intrinsics for masked load/stores, expand-loads and fp-class Also, remove some redundant sse target-features from avx intrinsics --- .../stdarch/crates/core_arch/src/x86/avx.rs | 30 +- .../crates/core_arch/src/x86/avx512bw.rs | 232 +--- .../crates/core_arch/src/x86/avx512dq.rs | 97 +- .../crates/core_arch/src/x86/avx512f.rs | 1202 +++++------------ .../crates/core_arch/src/x86/avx512vbmi2.rs | 158 +-- 5 files changed, 479 insertions(+), 1240 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs index 7726a188f2bf..51265b0380a2 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx.rs @@ -737,7 +737,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd) #[inline] -#[target_feature(enable = "avx,sse2")] +#[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd #[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] @@ -767,7 +767,7 @@ pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d) -> __m256d /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps) #[inline] -#[target_feature(enable = "avx,sse")] +#[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps #[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] @@ -799,7 +799,7 @@ pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256) -> __m256 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd) #[inline] -#[target_feature(enable = "avx,sse2")] +#[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd #[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] @@ -816,7 +816,7 @@ pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d) -> __m128d { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss) #[inline] -#[target_feature(enable = "avx,sse")] +#[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss #[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] @@ -1093,7 +1093,7 @@ pub unsafe fn _mm256_permute_ps(a: __m256) -> __m256 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps) #[inline] -#[target_feature(enable = "avx,sse")] +#[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] @@ -1163,7 +1163,7 @@ pub unsafe fn _mm256_permute_pd(a: __m256d) -> __m256d { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd) #[inline] -#[target_feature(enable = "avx,sse2")] +#[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] @@ -2733,7 +2733,7 @@ pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256) #[inline] -#[target_feature(enable = "avx,sse")] +#[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. #[stable(feature = "simd_x86", since = "1.27.0")] @@ -2747,7 +2747,7 @@ pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256) #[inline] -#[target_feature(enable = "avx,sse2")] +#[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. #[stable(feature = "simd_x86", since = "1.27.0")] @@ -2764,7 +2764,7 @@ pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256) #[inline] -#[target_feature(enable = "avx,sse2")] +#[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. #[stable(feature = "simd_x86", since = "1.27.0")] @@ -2888,7 +2888,7 @@ pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128) #[inline] -#[target_feature(enable = "avx,sse")] +#[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 { @@ -2903,7 +2903,7 @@ pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m2 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d) #[inline] -#[target_feature(enable = "avx,sse2")] +#[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d { @@ -2917,7 +2917,7 @@ pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i) #[inline] -#[target_feature(enable = "avx,sse2")] +#[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i { @@ -2932,7 +2932,7 @@ pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128) #[inline] -#[target_feature(enable = "avx,sse")] +#[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) { @@ -2949,7 +2949,7 @@ pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d) #[inline] -#[target_feature(enable = "avx,sse2")] +#[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) { @@ -2965,7 +2965,7 @@ pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i) #[inline] -#[target_feature(enable = "avx,sse2")] +#[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) { diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs index 00f2dc5fde0e..40b358bc2a22 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs @@ -1,5 +1,4 @@ use crate::{ - arch::asm, core_arch::{simd::*, x86::*}, intrinsics::simd::*, ptr, @@ -5396,19 +5395,11 @@ pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512bw")] +#[target_feature(enable = "avx512bw")] #[cfg_attr(test, assert_instr(vmovdqu16))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *const i16) -> __m512i { - let mut dst: __m512i = src; - asm!( - vpl!("vmovdqu16 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqu16_512(mem_addr, src.as_i16x32(), k)) } /// Load packed 16-bit integers from memory into dst using zeromask k @@ -5417,19 +5408,11 @@ pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *con /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512bw")] +#[target_feature(enable = "avx512bw")] #[cfg_attr(test, assert_instr(vmovdqu16))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i { - let mut dst: __m512i; - asm!( - vpl!("vmovdqu16 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_loadu_epi16(_mm512_setzero_si512(), k, mem_addr) } /// Load packed 8-bit integers from memory into dst using writemask k @@ -5438,19 +5421,11 @@ pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __ /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw")] +#[target_feature(enable = "avx512bw")] #[cfg_attr(test, assert_instr(vmovdqu8))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *const i8) -> __m512i { - let mut dst: __m512i = src; - asm!( - vpl!("vmovdqu8 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqu8_512(mem_addr, src.as_i8x64(), k)) } /// Load packed 8-bit integers from memory into dst using zeromask k @@ -5459,19 +5434,11 @@ pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *cons /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw")] +#[target_feature(enable = "avx512bw")] #[cfg_attr(test, assert_instr(vmovdqu8))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i { - let mut dst: __m512i; - asm!( - vpl!("vmovdqu8 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_loadu_epi8(_mm512_setzero_si512(), k, mem_addr) } /// Load packed 16-bit integers from memory into dst using writemask k @@ -5480,19 +5447,11 @@ pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m5 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +#[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu16))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *const i16) -> __m256i { - let mut dst: __m256i = src; - asm!( - vpl!("vmovdqu16 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqu16_256(mem_addr, src.as_i16x16(), k)) } /// Load packed 16-bit integers from memory into dst using zeromask k @@ -5501,19 +5460,11 @@ pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *con /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +#[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu16))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i { - let mut dst: __m256i; - asm!( - vpl!("vmovdqu16 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_loadu_epi16(_mm256_setzero_si256(), k, mem_addr) } /// Load packed 8-bit integers from memory into dst using writemask k @@ -5522,19 +5473,11 @@ pub unsafe fn _mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __ /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +#[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu8))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *const i8) -> __m256i { - let mut dst: __m256i = src; - asm!( - vpl!("vmovdqu8 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqu8_256(mem_addr, src.as_i8x32(), k)) } /// Load packed 8-bit integers from memory into dst using zeromask k @@ -5543,19 +5486,11 @@ pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *cons /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +#[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu8))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i { - let mut dst: __m256i; - asm!( - vpl!("vmovdqu8 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_loadu_epi8(_mm256_setzero_si256(), k, mem_addr) } /// Load packed 16-bit integers from memory into dst using writemask k @@ -5564,19 +5499,11 @@ pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m2 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +#[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu16))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i16) -> __m128i { - let mut dst: __m128i = src; - asm!( - vpl!("vmovdqu16 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqu16_128(mem_addr, src.as_i16x8(), k)) } /// Load packed 16-bit integers from memory into dst using zeromask k @@ -5585,19 +5512,11 @@ pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +#[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu16))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i { - let mut dst: __m128i; - asm!( - vpl!("vmovdqu16 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_loadu_epi16(_mm_setzero_si128(), k, mem_addr) } /// Load packed 8-bit integers from memory into dst using writemask k @@ -5606,19 +5525,11 @@ pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +#[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu8))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i8) -> __m128i { - let mut dst: __m128i = src; - asm!( - vpl!("vmovdqu8 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqu8_128(mem_addr, src.as_i8x16(), k)) } /// Load packed 8-bit integers from memory into dst using zeromask k @@ -5627,19 +5538,11 @@ pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +#[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu8))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i { - let mut dst: __m128i; - asm!( - vpl!("vmovdqu8 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_loadu_epi8(_mm_setzero_si128(), k, mem_addr) } /// Store packed 16-bit integers from a into memory using writemask k. @@ -5647,17 +5550,11 @@ pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512bw")] +#[target_feature(enable = "avx512bw")] #[cfg_attr(test, assert_instr(vmovdqu16))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: __m512i) { - asm!( - vps!("vmovdqu16", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(zmm_reg) a, - options(nostack, preserves_flags) - ); + storedqu16_512(mem_addr, a.as_i16x32(), mask) } /// Store packed 8-bit integers from a into memory using writemask k. @@ -5665,17 +5562,11 @@ pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: _ /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw")] +#[target_feature(enable = "avx512bw")] #[cfg_attr(test, assert_instr(vmovdqu8))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m512i) { - asm!( - vps!("vmovdqu8", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(zmm_reg) a, - options(nostack, preserves_flags) - ); + storedqu8_512(mem_addr, a.as_i8x64(), mask) } /// Store packed 16-bit integers from a into memory using writemask k. @@ -5683,17 +5574,11 @@ pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +#[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu16))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: __m256i) { - asm!( - vps!("vmovdqu16", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(ymm_reg) a, - options(nostack, preserves_flags) - ); + storedqu16_256(mem_addr, a.as_i16x16(), mask) } /// Store packed 8-bit integers from a into memory using writemask k. @@ -5701,17 +5586,11 @@ pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: _ /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +#[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu8))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m256i) { - asm!( - vps!("vmovdqu8", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(ymm_reg) a, - options(nostack, preserves_flags) - ); + storedqu8_256(mem_addr, a.as_i8x32(), mask) } /// Store packed 16-bit integers from a into memory using writemask k. @@ -5719,17 +5598,11 @@ pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +#[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu16))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m128i) { - asm!( - vps!("vmovdqu16", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(xmm_reg) a, - options(nostack, preserves_flags) - ); + storedqu16_128(mem_addr, a.as_i16x8(), mask) } /// Store packed 8-bit integers from a into memory using writemask k. @@ -5737,17 +5610,11 @@ pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m12 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +#[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu8))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128i) { - asm!( - vps!("vmovdqu8", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(xmm_reg) a, - options(nostack, preserves_flags) - ); + storedqu8_128(mem_addr, a.as_i8x16(), mask) } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst. @@ -11753,6 +11620,33 @@ extern "C" { fn vpmovuswbmem256(mem_addr: *mut i8, a: i16x16, mask: u16); #[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.128"] fn vpmovuswbmem128(mem_addr: *mut i8, a: i16x8, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.loadu.b.128"] + fn loaddqu8_128(mem_addr: *const i8, a: i8x16, mask: u16) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.loadu.w.128"] + fn loaddqu16_128(mem_addr: *const i16, a: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.loadu.b.256"] + fn loaddqu8_256(mem_addr: *const i8, a: i8x32, mask: u32) -> i8x32; + #[link_name = "llvm.x86.avx512.mask.loadu.w.256"] + fn loaddqu16_256(mem_addr: *const i16, a: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx512.mask.loadu.b.512"] + fn loaddqu8_512(mem_addr: *const i8, a: i8x64, mask: u64) -> i8x64; + #[link_name = "llvm.x86.avx512.mask.loadu.w.512"] + fn loaddqu16_512(mem_addr: *const i16, a: i16x32, mask: u32) -> i16x32; + + #[link_name = "llvm.x86.avx512.mask.storeu.b.128"] + fn storedqu8_128(mem_addr: *mut i8, a: i8x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.storeu.w.128"] + fn storedqu16_128(mem_addr: *mut i16, a: i16x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.b.256"] + fn storedqu8_256(mem_addr: *mut i8, a: i8x32, mask: u32); + #[link_name = "llvm.x86.avx512.mask.storeu.w.256"] + fn storedqu16_256(mem_addr: *mut i16, a: i16x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.storeu.b.512"] + fn storedqu8_512(mem_addr: *mut i8, a: i8x64, mask: u64); + #[link_name = "llvm.x86.avx512.mask.storeu.w.512"] + fn storedqu16_512(mem_addr: *mut i16, a: i16x32, mask: u32); + } #[cfg(test)] diff --git a/library/stdarch/crates/core_arch/src/x86/avx512dq.rs b/library/stdarch/crates/core_arch/src/x86/avx512dq.rs index a889509871a1..320315155829 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512dq.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512dq.rs @@ -6409,33 +6409,6 @@ pub unsafe fn _mm_maskz_reduce_ss(k: __mmask8, a: __m128, b: __ // FP-Class -// FIXME: Use LLVM intrinsics instead of inline assembly -macro_rules! fpclass_asm { - ($instr:literal, $mask_type: ty, $reg: ident, $a: expr) => {{ - let dst: $mask_type; - $crate::arch::asm!( - concat!($instr, " {k}, {src}, {imm8}"), - k = lateout(kreg) dst, - src = in($reg) $a, - imm8 = const IMM8, - options(pure, nomem, nostack) - ); - dst - }}; - ($instr:literal, $mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{ - let dst: $mask_type; - $crate::arch::asm!( - concat!($instr, " {k} {{ {mask} }}, {src}, {imm8}"), - k = lateout(kreg) dst, - mask = in(kreg) $mask, - src = in($reg) $a, - imm8 = const IMM8, - options(pure, nomem, nostack) - ); - dst - }}; -} - /// Test packed double-precision (64-bit) floating-point elements in a for special categories specified /// by imm8, and store the results in mask vector k. /// imm can be a combination of: @@ -6451,13 +6424,13 @@ macro_rules! fpclass_asm { /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_pd_mask&ig_expand=3493) #[inline] -#[target_feature(enable = "sse,avx512f,avx512dq,avx512vl")] +#[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_fpclass_pd_mask(a: __m128d) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!("vfpclasspd", __mmask8, xmm_reg, a) + _mm_mask_fpclass_pd_mask::(0xff, a) } /// Test packed double-precision (64-bit) floating-point elements in a for special categories specified @@ -6476,13 +6449,13 @@ pub unsafe fn _mm_fpclass_pd_mask(a: __m128d) -> __mmask8 { /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_pd_mask&ig_expand=3494) #[inline] -#[target_feature(enable = "sse,avx512f,avx512dq,avx512vl")] +#[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_fpclass_pd_mask(k1: __mmask8, a: __m128d) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!("vfpclasspd", __mmask8, k1, xmm_reg, a) + transmute(vfpclasspd_128(a.as_f64x2(), IMM8, k1)) } /// Test packed double-precision (64-bit) floating-point elements in a for special categories specified @@ -6500,13 +6473,13 @@ pub unsafe fn _mm_mask_fpclass_pd_mask(k1: __mmask8, a: __m128d /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fpclass_pd_mask&ig_expand=3495) #[inline] -#[target_feature(enable = "avx,avx512f,avx512dq,avx512vl")] +#[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_fpclass_pd_mask(a: __m256d) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!("vfpclasspd", __mmask8, ymm_reg, a) + _mm256_mask_fpclass_pd_mask::(0xff, a) } /// Test packed double-precision (64-bit) floating-point elements in a for special categories specified @@ -6525,13 +6498,13 @@ pub unsafe fn _mm256_fpclass_pd_mask(a: __m256d) -> __mmask8 { /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fpclass_pd_mask&ig_expand=3496) #[inline] -#[target_feature(enable = "avx,avx512f,avx512dq,avx512vl")] +#[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_fpclass_pd_mask(k1: __mmask8, a: __m256d) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!("vfpclasspd", __mmask8, k1, ymm_reg, a) + transmute(vfpclasspd_256(a.as_f64x4(), IMM8, k1)) } /// Test packed double-precision (64-bit) floating-point elements in a for special categories specified @@ -6549,13 +6522,13 @@ pub unsafe fn _mm256_mask_fpclass_pd_mask(k1: __mmask8, a: __m2 /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fpclass_pd_mask&ig_expand=3497) #[inline] -#[target_feature(enable = "avx512f,avx512dq")] +#[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_fpclass_pd_mask(a: __m512d) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!("vfpclasspd", __mmask8, zmm_reg, a) + _mm512_mask_fpclass_pd_mask::(0xff, a) } /// Test packed double-precision (64-bit) floating-point elements in a for special categories specified @@ -6574,13 +6547,13 @@ pub unsafe fn _mm512_fpclass_pd_mask(a: __m512d) -> __mmask8 { /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fpclass_pd_mask&ig_expand=3498) #[inline] -#[target_feature(enable = "avx512f,avx512dq")] +#[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_fpclass_pd_mask(k1: __mmask8, a: __m512d) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!("vfpclasspd", __mmask8, k1, zmm_reg, a) + transmute(vfpclasspd_512(a.as_f64x8(), IMM8, k1)) } /// Test packed single-precision (32-bit) floating-point elements in a for special categories specified @@ -6598,13 +6571,13 @@ pub unsafe fn _mm512_mask_fpclass_pd_mask(k1: __mmask8, a: __m5 /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_ps_mask&ig_expand=3505) #[inline] -#[target_feature(enable = "sse,avx512f,avx512dq,avx512vl")] +#[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_fpclass_ps_mask(a: __m128) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!("vfpclassps", __mmask8, xmm_reg, a) + _mm_mask_fpclass_ps_mask::(0xff, a) } /// Test packed single-precision (32-bit) floating-point elements in a for special categories specified @@ -6623,13 +6596,13 @@ pub unsafe fn _mm_fpclass_ps_mask(a: __m128) -> __mmask8 { /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_ps_mask&ig_expand=3506) #[inline] -#[target_feature(enable = "sse,avx512f,avx512dq,avx512vl")] +#[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_fpclass_ps_mask(k1: __mmask8, a: __m128) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!("vfpclassps", __mmask8, k1, xmm_reg, a) + transmute(vfpclassps_128(a.as_f32x4(), IMM8, k1)) } /// Test packed single-precision (32-bit) floating-point elements in a for special categories specified @@ -6647,13 +6620,13 @@ pub unsafe fn _mm_mask_fpclass_ps_mask(k1: __mmask8, a: __m128) /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fpclass_ps_mask&ig_expand=3507) #[inline] -#[target_feature(enable = "avx,avx512f,avx512dq,avx512vl")] +#[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_fpclass_ps_mask(a: __m256) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!("vfpclassps", __mmask8, ymm_reg, a) + _mm256_mask_fpclass_ps_mask::(0xff, a) } /// Test packed single-precision (32-bit) floating-point elements in a for special categories specified @@ -6672,13 +6645,13 @@ pub unsafe fn _mm256_fpclass_ps_mask(a: __m256) -> __mmask8 { /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fpclass_ps_mask&ig_expand=3508) #[inline] -#[target_feature(enable = "avx,avx512f,avx512dq,avx512vl")] +#[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_fpclass_ps_mask(k1: __mmask8, a: __m256) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!("vfpclassps", __mmask8, k1, ymm_reg, a) + transmute(vfpclassps_256(a.as_f32x8(), IMM8, k1)) } /// Test packed single-precision (32-bit) floating-point elements in a for special categories specified @@ -6696,13 +6669,13 @@ pub unsafe fn _mm256_mask_fpclass_ps_mask(k1: __mmask8, a: __m2 /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fpclass_ps_mask&ig_expand=3509) #[inline] -#[target_feature(enable = "avx512f,avx512dq")] +#[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_fpclass_ps_mask(a: __m512) -> __mmask16 { static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!("vfpclassps", __mmask16, zmm_reg, a) + _mm512_mask_fpclass_ps_mask::(0xffff, a) } /// Test packed single-precision (32-bit) floating-point elements in a for special categories specified @@ -6721,13 +6694,13 @@ pub unsafe fn _mm512_fpclass_ps_mask(a: __m512) -> __mmask16 { /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fpclass_ps_mask&ig_expand=3510) #[inline] -#[target_feature(enable = "avx512f,avx512dq")] +#[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_fpclass_ps_mask(k1: __mmask16, a: __m512) -> __mmask16 { static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!("vfpclassps", __mmask16, k1, zmm_reg, a) + transmute(vfpclassps_512(a.as_f32x16(), IMM8, k1)) } /// Test the lower double-precision (64-bit) floating-point element in a for special categories specified @@ -6745,7 +6718,7 @@ pub unsafe fn _mm512_mask_fpclass_ps_mask(k1: __mmask16, a: __m /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_sd_mask&ig_expand=3511) #[inline] -#[target_feature(enable = "sse,avx512f,avx512dq")] +#[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vfpclasssd, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] @@ -6770,7 +6743,7 @@ pub unsafe fn _mm_fpclass_sd_mask(a: __m128d) -> __mmask8 { /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_sd_mask&ig_expand=3512) #[inline] -#[target_feature(enable = "sse,avx512f,avx512dq")] +#[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vfpclasssd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] @@ -6794,7 +6767,7 @@ pub unsafe fn _mm_mask_fpclass_sd_mask(k1: __mmask8, a: __m128d /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_ss_mask&ig_expand=3515) #[inline] -#[target_feature(enable = "sse,avx512f,avx512dq")] +#[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vfpclassss, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] @@ -6819,7 +6792,7 @@ pub unsafe fn _mm_fpclass_ss_mask(a: __m128) -> __mmask8 { /// /// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_ss_mask&ig_expand=3516) #[inline] -#[target_feature(enable = "sse,avx512f,avx512dq")] +#[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vfpclassss, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] @@ -6953,6 +6926,20 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.reduce.ss"] fn vreducess(a: f32x4, b: f32x4, src: f32x4, k: __mmask8, imm8: i32, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.fpclass.pd.128"] + fn vfpclasspd_128(a: f64x2, imm8: i32, k: __mmask8) -> __mmask8; + #[link_name = "llvm.x86.avx512.mask.fpclass.pd.256"] + fn vfpclasspd_256(a: f64x4, imm8: i32, k: __mmask8) -> __mmask8; + #[link_name = "llvm.x86.avx512.mask.fpclass.pd.512"] + fn vfpclasspd_512(a: f64x8, imm8: i32, k: __mmask8) -> __mmask8; + + #[link_name = "llvm.x86.avx512.mask.fpclass.ps.128"] + fn vfpclassps_128(a: f32x4, imm8: i32, k: __mmask8) -> __mmask8; + #[link_name = "llvm.x86.avx512.mask.fpclass.ps.256"] + fn vfpclassps_256(a: f32x8, imm8: i32, k: __mmask8) -> __mmask8; + #[link_name = "llvm.x86.avx512.mask.fpclass.ps.512"] + fn vfpclassps_512(a: f32x16, imm8: i32, k: __mmask16) -> __mmask16; + #[link_name = "llvm.x86.avx512.mask.fpclass.sd"] fn vfpclasssd(a: f64x2, imm8: i32, k: __mmask8) -> __mmask8; #[link_name = "llvm.x86.avx512.mask.fpclass.ss"] diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs index 032d5b4ab706..1dcdcab3d852 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs @@ -34002,15 +34002,7 @@ pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) { #[cfg_attr(test, assert_instr(vmovdqu32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { - let mut dst: __m512i = src; - asm!( - vpl!("vmovdqu32 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqu32_512(mem_addr, src.as_i32x16(), k)) } /// Load packed 32-bit integers from memory into dst using zeromask k @@ -34023,15 +34015,7 @@ pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *con #[cfg_attr(test, assert_instr(vmovdqu32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { - let mut dst: __m512i; - asm!( - vpl!("vmovdqu32 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_loadu_epi32(_mm512_setzero_si512(), k, mem_addr) } /// Load packed 64-bit integers from memory into dst using writemask k @@ -34044,15 +34028,7 @@ pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __ #[cfg_attr(test, assert_instr(vmovdqu64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { - let mut dst: __m512i = src; - asm!( - vpl!("vmovdqu64 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqu64_512(mem_addr, src.as_i64x8(), k)) } /// Load packed 64-bit integers from memory into dst using zeromask k @@ -34065,15 +34041,7 @@ pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *cons #[cfg_attr(test, assert_instr(vmovdqu64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { - let mut dst: __m512i; - asm!( - vpl!("vmovdqu64 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_loadu_epi64(_mm512_setzero_si512(), k, mem_addr) } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k @@ -34086,15 +34054,7 @@ pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m #[cfg_attr(test, assert_instr(vmovups))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { - let mut dst: __m512 = src; - asm!( - vpl!("vmovups {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loadups_512(mem_addr, src.as_f32x16(), k)) } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k @@ -34107,15 +34067,7 @@ pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f #[cfg_attr(test, assert_instr(vmovups))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { - let mut dst: __m512; - asm!( - vpl!("vmovups {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_loadu_ps(_mm512_setzero_ps(), k, mem_addr) } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k @@ -34128,15 +34080,7 @@ pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m51 #[cfg_attr(test, assert_instr(vmovupd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { - let mut dst: __m512d = src; - asm!( - vpl!("vmovupd {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loadupd_512(mem_addr, src.as_f64x8(), k)) } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k @@ -34149,15 +34093,7 @@ pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f #[cfg_attr(test, assert_instr(vmovupd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { - let mut dst: __m512d; - asm!( - vpl!("vmovupd {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_loadu_pd(_mm512_setzero_pd(), k, mem_addr) } /// Load packed 32-bit integers from memory into dst using writemask k @@ -34166,19 +34102,11 @@ pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { - let mut dst: __m256i = src; - asm!( - vpl!("vmovdqu32 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqu32_256(mem_addr, src.as_i32x8(), k)) } /// Load packed 32-bit integers from memory into dst using zeromask k @@ -34187,19 +34115,11 @@ pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *cons /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { - let mut dst: __m256i; - asm!( - vpl!("vmovdqu32 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_loadu_epi32(_mm256_setzero_si256(), k, mem_addr) } /// Load packed 64-bit integers from memory into dst using writemask k @@ -34208,19 +34128,11 @@ pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { - let mut dst: __m256i = src; - asm!( - vpl!("vmovdqu64 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqu64_256(mem_addr, src.as_i64x4(), k)) } /// Load packed 64-bit integers from memory into dst using zeromask k @@ -34229,19 +34141,11 @@ pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *cons /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { - let mut dst: __m256i; - asm!( - vpl!("vmovdqu64 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_loadu_epi64(_mm256_setzero_si256(), k, mem_addr) } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k @@ -34250,19 +34154,11 @@ pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovups))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { - let mut dst: __m256 = src; - asm!( - vpl!("vmovups {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loadups_256(mem_addr, src.as_f32x8(), k)) } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k @@ -34271,19 +34167,11 @@ pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f3 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovups))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { - let mut dst: __m256; - asm!( - vpl!("vmovups {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_loadu_ps(_mm256_setzero_ps(), k, mem_addr) } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k @@ -34292,19 +34180,11 @@ pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovupd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { - let mut dst: __m256d = src; - asm!( - vpl!("vmovupd {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loadupd_256(mem_addr, src.as_f64x4(), k)) } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k @@ -34313,19 +34193,11 @@ pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovupd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { - let mut dst: __m256d; - asm!( - vpl!("vmovupd {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_loadu_pd(_mm256_setzero_pd(), k, mem_addr) } /// Load packed 32-bit integers from memory into dst using writemask k @@ -34334,19 +34206,11 @@ pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { - let mut dst: __m128i = src; - asm!( - vpl!("vmovdqu32 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqu32_128(mem_addr, src.as_i32x4(), k)) } /// Load packed 32-bit integers from memory into dst using zeromask k @@ -34355,19 +34219,11 @@ pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { - let mut dst: __m128i; - asm!( - vpl!("vmovdqu32 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_loadu_epi32(_mm_setzero_si128(), k, mem_addr) } /// Load packed 64-bit integers from memory into dst using writemask k @@ -34376,19 +34232,11 @@ pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { - let mut dst: __m128i = src; - asm!( - vpl!("vmovdqu64 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqu64_128(mem_addr, src.as_i64x2(), k)) } /// Load packed 64-bit integers from memory into dst using zeromask k @@ -34397,19 +34245,11 @@ pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { - let mut dst: __m128i; - asm!( - vpl!("vmovdqu64 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_loadu_epi64(_mm_setzero_si128(), k, mem_addr) } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k @@ -34418,19 +34258,11 @@ pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovups))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { - let mut dst: __m128 = src; - asm!( - vpl!("vmovups {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loadups_128(mem_addr, src.as_f32x4(), k)) } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k @@ -34439,19 +34271,11 @@ pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovups))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { - let mut dst: __m128; - asm!( - vpl!("vmovups {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_loadu_ps(_mm_setzero_ps(), k, mem_addr) } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k @@ -34460,19 +34284,11 @@ pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovupd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { - let mut dst: __m128d = src; - asm!( - vpl!("vmovupd {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loadupd_128(mem_addr, src.as_f64x2(), k)) } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k @@ -34481,19 +34297,11 @@ pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovupd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { - let mut dst: __m128d; - asm!( - vpl!("vmovupd {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_loadu_pd(_mm_setzero_pd(), k, mem_addr) } /// Load packed 32-bit integers from memory into dst using writemask k @@ -34503,17 +34311,10 @@ pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_epi32) #[inline] #[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { - let mut dst: __m512i = src; - asm!( - vpl!("vmovdqa32 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqa32_512(mem_addr, src.as_i32x16(), k)) } /// Load packed 32-bit integers from memory into dst using zeromask k @@ -34523,17 +34324,10 @@ pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *cons /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_epi32) #[inline] #[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { - let mut dst: __m512i; - asm!( - vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_load_epi32(_mm512_setzero_si512(), k, mem_addr) } /// Load packed 64-bit integers from memory into dst using writemask k @@ -34543,17 +34337,10 @@ pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_epi64) #[inline] #[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { - let mut dst: __m512i = src; - asm!( - vpl!("vmovdqa64 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqa64_512(mem_addr, src.as_i64x8(), k)) } /// Load packed 64-bit integers from memory into dst using zeromask k @@ -34563,17 +34350,10 @@ pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_epi64) #[inline] #[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { - let mut dst: __m512i; - asm!( - vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_load_epi64(_mm512_setzero_si512(), k, mem_addr) } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k @@ -34583,17 +34363,10 @@ pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m5 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_ps) #[inline] #[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { - let mut dst: __m512 = src; - asm!( - vpl!("vmovaps {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loadaps_512(mem_addr, src.as_f32x16(), k)) } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k @@ -34603,17 +34376,10 @@ pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f3 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_ps) #[inline] #[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { - let mut dst: __m512; - asm!( - vpl!("vmovaps {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_load_ps(_mm512_setzero_ps(), k, mem_addr) } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k @@ -34623,17 +34389,10 @@ pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_load_pd) #[inline] #[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { - let mut dst: __m512d = src; - asm!( - vpl!("vmovapd {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loadapd_512(mem_addr, src.as_f64x8(), k)) } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k @@ -34643,17 +34402,10 @@ pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f6 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_load_pd) #[inline] #[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { - let mut dst: __m512d; - asm!( - vpl!("vmovapd {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_load_pd(_mm512_setzero_pd(), k, mem_addr) } /// Load packed 32-bit integers from memory into dst using writemask k @@ -34662,18 +34414,11 @@ pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { - let mut dst: __m256i = src; - asm!( - vpl!("vmovdqa32 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqa32_256(mem_addr, src.as_i32x8(), k)) } /// Load packed 32-bit integers from memory into dst using zeromask k @@ -34682,18 +34427,11 @@ pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { - let mut dst: __m256i; - asm!( - vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_load_epi32(_mm256_setzero_si256(), k, mem_addr) } /// Load packed 64-bit integers from memory into dst using writemask k @@ -34702,18 +34440,11 @@ pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m2 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { - let mut dst: __m256i = src; - asm!( - vpl!("vmovdqa64 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqa64_256(mem_addr, src.as_i64x4(), k)) } /// Load packed 64-bit integers from memory into dst using zeromask k @@ -34722,18 +34453,11 @@ pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { - let mut dst: __m256i; - asm!( - vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_load_epi64(_mm256_setzero_si256(), k, mem_addr) } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k @@ -34742,18 +34466,11 @@ pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m2 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { - let mut dst: __m256 = src; - asm!( - vpl!("vmovaps {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loadaps_256(mem_addr, src.as_f32x8(), k)) } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k @@ -34762,18 +34479,11 @@ pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { - let mut dst: __m256; - asm!( - vpl!("vmovaps {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_load_ps(_mm256_setzero_ps(), k, mem_addr) } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k @@ -34782,18 +34492,11 @@ pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_load_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { - let mut dst: __m256d = src; - asm!( - vpl!("vmovapd {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loadapd_256(mem_addr, src.as_f64x4(), k)) } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k @@ -34802,18 +34505,11 @@ pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f6 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_load_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { - let mut dst: __m256d; - asm!( - vpl!("vmovapd {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_load_pd(_mm256_setzero_pd(), k, mem_addr) } /// Load packed 32-bit integers from memory into dst using writemask k @@ -34822,18 +34518,11 @@ pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { - let mut dst: __m128i = src; - asm!( - vpl!("vmovdqa32 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqa32_128(mem_addr, src.as_i32x4(), k)) } /// Load packed 32-bit integers from memory into dst using zeromask k @@ -34842,18 +34531,11 @@ pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i3 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { - let mut dst: __m128i; - asm!( - vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_load_epi32(_mm_setzero_si128(), k, mem_addr) } /// Load packed 64-bit integers from memory into dst using writemask k @@ -34862,18 +34544,11 @@ pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { - let mut dst: __m128i = src; - asm!( - vpl!("vmovdqa64 {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loaddqa64_128(mem_addr, src.as_i64x2(), k)) } /// Load packed 64-bit integers from memory into dst using zeromask k @@ -34882,18 +34557,11 @@ pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i6 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { - let mut dst: __m128i; - asm!( - vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_load_epi64(_mm_setzero_si128(), k, mem_addr) } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k @@ -34902,18 +34570,11 @@ pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { - let mut dst: __m128 = src; - asm!( - vpl!("vmovaps {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loadaps_128(mem_addr, src.as_f32x4(), k)) } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k @@ -34922,18 +34583,11 @@ pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) - /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { - let mut dst: __m128; - asm!( - vpl!("vmovaps {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_load_ps(_mm_setzero_ps(), k, mem_addr) } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k @@ -34942,18 +34596,11 @@ pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_load_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { - let mut dst: __m128d = src; - asm!( - vpl!("vmovapd {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(loadapd_128(mem_addr, src.as_f64x2(), k)) } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k @@ -34962,18 +34609,11 @@ pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_load_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { - let mut dst: __m128d; - asm!( - vpl!("vmovapd {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_load_pd(_mm_setzero_pd(), k, mem_addr) } /// Load a single-precision (32-bit) floating-point element from memory into the lower element of dst @@ -35073,13 +34713,7 @@ pub unsafe fn _mm_maskz_load_sd(k: __mmask8, mem_addr: *const f64) -> __m128d { #[cfg_attr(test, assert_instr(vmovdqu32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { - asm!( - vps!("vmovdqu32", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(zmm_reg) a, - options(nostack, preserves_flags) - ); + storedqu32_512(mem_addr, a.as_i32x16(), mask) } /// Store packed 64-bit integers from a into memory using writemask k. @@ -35091,13 +34725,7 @@ pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: _ #[cfg_attr(test, assert_instr(vmovdqu64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { - asm!( - vps!("vmovdqu64", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(zmm_reg) a, - options(nostack, preserves_flags) - ); + storedqu64_512(mem_addr, a.as_i64x8(), mask) } /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. @@ -35109,13 +34737,7 @@ pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __ #[cfg_attr(test, assert_instr(vmovups))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { - asm!( - vps!("vmovups", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(zmm_reg) a, - options(nostack, preserves_flags) - ); + storeups_512(mem_addr, a.as_f32x16(), mask) } /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. @@ -35127,13 +34749,7 @@ pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m5 #[cfg_attr(test, assert_instr(vmovupd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { - asm!( - vps!("vmovupd", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(zmm_reg) a, - options(nostack, preserves_flags) - ); + storeupd_512(mem_addr, a.as_f64x8(), mask) } /// Store packed 32-bit integers from a into memory using writemask k. @@ -35141,17 +34757,11 @@ pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m51 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { - asm!( - vps!("vmovdqu32", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(ymm_reg) a, - options(nostack, preserves_flags) - ); + storedqu32_256(mem_addr, a.as_i32x8(), mask) } /// Store packed 64-bit integers from a into memory using writemask k. @@ -35159,17 +34769,11 @@ pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __ /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { - asm!( - vps!("vmovdqu64", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(ymm_reg) a, - options(nostack, preserves_flags) - ); + storedqu64_256(mem_addr, a.as_i64x4(), mask) } /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. @@ -35177,17 +34781,11 @@ pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __ /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovups))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { - asm!( - vps!("vmovups", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(ymm_reg) a, - options(nostack, preserves_flags) - ); + storeups_256(mem_addr, a.as_f32x8(), mask) } /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. @@ -35195,17 +34793,11 @@ pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m25 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovupd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { - asm!( - vps!("vmovupd", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(ymm_reg) a, - options(nostack, preserves_flags) - ); + storeupd_256(mem_addr, a.as_f64x4(), mask) } /// Store packed 32-bit integers from a into memory using writemask k. @@ -35213,17 +34805,11 @@ pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m25 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { - asm!( - vps!("vmovdqu32", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(xmm_reg) a, - options(nostack, preserves_flags) - ); + storedqu32_128(mem_addr, a.as_i32x4(), mask) } /// Store packed 64-bit integers from a into memory using writemask k. @@ -35231,17 +34817,11 @@ pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m12 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovdqu64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { - asm!( - vps!("vmovdqu64", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(xmm_reg) a, - options(nostack, preserves_flags) - ); + storedqu64_128(mem_addr, a.as_i64x2(), mask) } /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. @@ -35249,17 +34829,11 @@ pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m12 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovups))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { - asm!( - vps!("vmovups", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(xmm_reg) a, - options(nostack, preserves_flags) - ); + storeups_128(mem_addr, a.as_f32x4(), mask) } /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. @@ -35267,17 +34841,11 @@ pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vmovupd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { - asm!( - vps!("vmovupd", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(xmm_reg) a, - options(nostack, preserves_flags) - ); + storeupd_128(mem_addr, a.as_f64x2(), mask) } /// Store packed 32-bit integers from a into memory using writemask k. @@ -35286,15 +34854,10 @@ pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_epi32) #[inline] #[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { - asm!( - vps!("vmovdqa32", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(zmm_reg) a, - options(nostack, preserves_flags) - ); + storedqa32_512(mem_addr, a.as_i32x16(), mask) } /// Store packed 64-bit integers from a into memory using writemask k. @@ -35303,15 +34866,10 @@ pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __ /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_epi64) #[inline] #[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { - asm!( - vps!("vmovdqa64", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(zmm_reg) a, - options(nostack, preserves_flags) - ); + storedqa64_512(mem_addr, a.as_i64x8(), mask) } /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. @@ -35320,15 +34878,10 @@ pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_ps) #[inline] #[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { - asm!( - vps!("vmovaps", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(zmm_reg) a, - options(nostack, preserves_flags) - ); + storeaps_512(mem_addr, a.as_f32x16(), mask) } /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. @@ -35337,15 +34890,10 @@ pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m51 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_store_pd) #[inline] #[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { - asm!( - vps!("vmovapd", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(zmm_reg) a, - options(nostack, preserves_flags) - ); + storeapd_512(mem_addr, a.as_f64x8(), mask) } /// Store packed 32-bit integers from a into memory using writemask k. @@ -35353,16 +34901,11 @@ pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { - asm!( - vps!("vmovdqa32", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(ymm_reg) a, - options(nostack, preserves_flags) - ); + storedqa32_256(mem_addr, a.as_i32x8(), mask) } /// Store packed 64-bit integers from a into memory using writemask k. @@ -35370,16 +34913,11 @@ pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { - asm!( - vps!("vmovdqa64", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(ymm_reg) a, - options(nostack, preserves_flags) - ); + storedqa64_256(mem_addr, a.as_i64x4(), mask) } /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. @@ -35387,16 +34925,11 @@ pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { - asm!( - vps!("vmovaps", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(ymm_reg) a, - options(nostack, preserves_flags) - ); + storeaps_256(mem_addr, a.as_f32x8(), mask) } /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. @@ -35404,16 +34937,11 @@ pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_store_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { - asm!( - vps!("vmovapd", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(ymm_reg) a, - options(nostack, preserves_flags) - ); + storeapd_256(mem_addr, a.as_f64x4(), mask) } /// Store packed 32-bit integers from a into memory using writemask k. @@ -35421,16 +34949,11 @@ pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { - asm!( - vps!("vmovdqa32", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(xmm_reg) a, - options(nostack, preserves_flags) - ); + storedqa32_128(mem_addr, a.as_i32x4(), mask) } /// Store packed 64-bit integers from a into memory using writemask k. @@ -35438,16 +34961,11 @@ pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { - asm!( - vps!("vmovdqa64", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(xmm_reg) a, - options(nostack, preserves_flags) - ); + storedqa64_128(mem_addr, a.as_i64x2(), mask) } /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. @@ -35455,16 +34973,11 @@ pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128 /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { - asm!( - vps!("vmovaps", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(xmm_reg) a, - options(nostack, preserves_flags) - ); + storeaps_128(mem_addr, a.as_f32x4(), mask) } /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. @@ -35472,16 +34985,11 @@ pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_store_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { - asm!( - vps!("vmovapd", "{{{mask}}}, {a}"), - p = in(reg) mem_addr, - mask = in(kreg) mask, - a = in(xmm_reg) a, - options(nostack, preserves_flags) - ); + storeapd_128(mem_addr, a.as_f64x2(), mask) } /// Store a single-precision (32-bit) floating-point element from a into memory using writemask k. mem_addr @@ -35532,15 +35040,7 @@ pub unsafe fn _mm512_mask_expandloadu_epi32( k: __mmask16, mem_addr: *const i32, ) -> __m512i { - let mut dst: __m512i = src; - asm!( - vpl!("vpexpandd {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadd_512(mem_addr, src.as_i32x16(), k)) } /// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -35551,22 +35051,14 @@ pub unsafe fn _mm512_mask_expandloadu_epi32( #[cfg_attr(test, assert_instr(vpexpandd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_expandloadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { - let mut dst: __m512i; - asm!( - vpl!("vpexpandd {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_expandloadu_epi32(_mm512_setzero_si512(), k, mem_addr) } /// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_expandloadu_epi32( @@ -35574,41 +35066,25 @@ pub unsafe fn _mm256_mask_expandloadu_epi32( k: __mmask8, mem_addr: *const i32, ) -> __m256i { - let mut dst: __m256i = src; - asm!( - vpl!("vpexpandd {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadd_256(mem_addr, src.as_i32x8(), k)) } /// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { - let mut dst: __m256i; - asm!( - vpl!("vpexpandd {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_expandloadu_epi32(_mm256_setzero_si256(), k, mem_addr) } /// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_expandloadu_epi32( @@ -35616,34 +35092,18 @@ pub unsafe fn _mm_mask_expandloadu_epi32( k: __mmask8, mem_addr: *const i32, ) -> __m128i { - let mut dst: __m128i = src; - asm!( - vpl!("vpexpandd {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadd_128(mem_addr, src.as_i32x4(), k)) } /// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { - let mut dst: __m128i; - asm!( - vpl!("vpexpandd {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_expandloadu_epi32(_mm_setzero_si128(), k, mem_addr) } /// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -35658,15 +35118,7 @@ pub unsafe fn _mm512_mask_expandloadu_epi64( k: __mmask8, mem_addr: *const i64, ) -> __m512i { - let mut dst: __m512i = src; - asm!( - vpl!("vpexpandq {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadq_512(mem_addr, src.as_i64x8(), k)) } /// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -35677,22 +35129,14 @@ pub unsafe fn _mm512_mask_expandloadu_epi64( #[cfg_attr(test, assert_instr(vpexpandq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { - let mut dst: __m512i; - asm!( - vpl!("vpexpandq {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_expandloadu_epi64(_mm512_setzero_si512(), k, mem_addr) } /// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_expandloadu_epi64( @@ -35700,41 +35144,25 @@ pub unsafe fn _mm256_mask_expandloadu_epi64( k: __mmask8, mem_addr: *const i64, ) -> __m256i { - let mut dst: __m256i = src; - asm!( - vpl!("vpexpandq {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadq_256(mem_addr, src.as_i64x4(), k)) } /// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { - let mut dst: __m256i; - asm!( - vpl!("vpexpandq {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_expandloadu_epi64(_mm256_setzero_si256(), k, mem_addr) } /// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_expandloadu_epi64( @@ -35742,34 +35170,18 @@ pub unsafe fn _mm_mask_expandloadu_epi64( k: __mmask8, mem_addr: *const i64, ) -> __m128i { - let mut dst: __m128i = src; - asm!( - vpl!("vpexpandq {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadq_128(mem_addr, src.as_i64x2(), k)) } /// Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { - let mut dst: __m128i; - asm!( - vpl!("vpexpandq {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_expandloadu_epi64(_mm_setzero_si128(), k, mem_addr) } /// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -35784,15 +35196,7 @@ pub unsafe fn _mm512_mask_expandloadu_ps( k: __mmask16, mem_addr: *const f32, ) -> __m512 { - let mut dst: __m512 = src; - asm!( - vpl!("vexpandps {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadps_512(mem_addr, src.as_f32x16(), k)) } /// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -35803,91 +35207,51 @@ pub unsafe fn _mm512_mask_expandloadu_ps( #[cfg_attr(test, assert_instr(vexpandps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_expandloadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { - let mut dst: __m512; - asm!( - vpl!("vexpandps {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_expandloadu_ps(_mm512_setzero_ps(), k, mem_addr) } /// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vexpandps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_expandloadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { - let mut dst: __m256 = src; - asm!( - vpl!("vexpandps {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadps_256(mem_addr, src.as_f32x8(), k)) } /// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vexpandps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { - let mut dst: __m256; - asm!( - vpl!("vexpandps {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_expandloadu_ps(_mm256_setzero_ps(), k, mem_addr) } /// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vexpandps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_expandloadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { - let mut dst: __m128 = src; - asm!( - vpl!("vexpandps {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadps_128(mem_addr, src.as_f32x4(), k)) } /// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vexpandps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { - let mut dst: __m128; - asm!( - vpl!("vexpandps {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_expandloadu_ps(_mm_setzero_ps(), k, mem_addr) } /// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -35902,15 +35266,7 @@ pub unsafe fn _mm512_mask_expandloadu_pd( k: __mmask8, mem_addr: *const f64, ) -> __m512d { - let mut dst: __m512d = src; - asm!( - vpl!("vexpandpd {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadpd_512(mem_addr, src.as_f64x8(), k)) } /// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -35921,22 +35277,14 @@ pub unsafe fn _mm512_mask_expandloadu_pd( #[cfg_attr(test, assert_instr(vexpandpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { - let mut dst: __m512d; - asm!( - vpl!("vexpandpd {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_expandloadu_pd(_mm512_setzero_pd(), k, mem_addr) } /// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vexpandpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_expandloadu_pd( @@ -35944,72 +35292,40 @@ pub unsafe fn _mm256_mask_expandloadu_pd( k: __mmask8, mem_addr: *const f64, ) -> __m256d { - let mut dst: __m256d = src; - asm!( - vpl!("vexpandpd {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadpd_256(mem_addr, src.as_f64x4(), k)) } /// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vexpandpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { - let mut dst: __m256d; - asm!( - vpl!("vexpandpd {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_expandloadu_pd(_mm256_setzero_pd(), k, mem_addr) } /// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vexpandpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_expandloadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { - let mut dst: __m128d = src; - asm!( - vpl!("vexpandpd {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadpd_128(mem_addr, src.as_f64x2(), k)) } /// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +#[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vexpandpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { - let mut dst: __m128d; - asm!( - vpl!("vexpandpd {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_expandloadu_pd(_mm_setzero_pd(), k, mem_addr) } /// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order. @@ -42848,6 +42164,132 @@ extern "C" { fn vcomiss(a: f32x4, b: f32x4, imm8: i32, sae: i32) -> i32; #[link_name = "llvm.x86.avx512.vcomi.sd"] fn vcomisd(a: f64x2, b: f64x2, imm8: i32, sae: i32) -> i32; + + #[link_name = "llvm.x86.avx512.mask.loadu.d.128"] + fn loaddqu32_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.loadu.q.128"] + fn loaddqu64_128(mem_addr: *const i64, a: i64x2, mask: u8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.loadu.ps.128"] + fn loadups_128(mem_addr: *const f32, a: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.loadu.pd.128"] + fn loadupd_128(mem_addr: *const f64, a: f64x2, mask: u8) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.loadu.d.256"] + fn loaddqu32_256(mem_addr: *const i32, a: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.loadu.q.256"] + fn loaddqu64_256(mem_addr: *const i64, a: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.loadu.ps.256"] + fn loadups_256(mem_addr: *const f32, a: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.loadu.pd.256"] + fn loadupd_256(mem_addr: *const f64, a: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.loadu.d.512"] + fn loaddqu32_512(mem_addr: *const i32, a: i32x16, mask: u16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.loadu.q.512"] + fn loaddqu64_512(mem_addr: *const i64, a: i64x8, mask: u8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.loadu.ps.512"] + fn loadups_512(mem_addr: *const f32, a: f32x16, mask: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.loadu.pd.512"] + fn loadupd_512(mem_addr: *const f64, a: f64x8, mask: u8) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.load.d.128"] + fn loaddqa32_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.load.q.128"] + fn loaddqa64_128(mem_addr: *const i64, a: i64x2, mask: u8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.load.ps.128"] + fn loadaps_128(mem_addr: *const f32, a: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.load.pd.128"] + fn loadapd_128(mem_addr: *const f64, a: f64x2, mask: u8) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.load.d.256"] + fn loaddqa32_256(mem_addr: *const i32, a: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.load.q.256"] + fn loaddqa64_256(mem_addr: *const i64, a: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.load.ps.256"] + fn loadaps_256(mem_addr: *const f32, a: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.load.pd.256"] + fn loadapd_256(mem_addr: *const f64, a: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.load.d.512"] + fn loaddqa32_512(mem_addr: *const i32, a: i32x16, mask: u16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.load.q.512"] + fn loaddqa64_512(mem_addr: *const i64, a: i64x8, mask: u8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.load.ps.512"] + fn loadaps_512(mem_addr: *const f32, a: f32x16, mask: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.load.pd.512"] + fn loadapd_512(mem_addr: *const f64, a: f64x8, mask: u8) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.storeu.d.128"] + fn storedqu32_128(mem_addr: *mut i32, a: i32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.q.128"] + fn storedqu64_128(mem_addr: *mut i64, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.ps.128"] + fn storeups_128(mem_addr: *mut f32, a: f32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.pd.128"] + fn storeupd_128(mem_addr: *mut f64, a: f64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.d.256"] + fn storedqu32_256(mem_addr: *mut i32, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.q.256"] + fn storedqu64_256(mem_addr: *mut i64, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.ps.256"] + fn storeups_256(mem_addr: *mut f32, a: f32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.pd.256"] + fn storeupd_256(mem_addr: *mut f64, a: f64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.d.512"] + fn storedqu32_512(mem_addr: *mut i32, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.storeu.q.512"] + fn storedqu64_512(mem_addr: *mut i64, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.storeu.ps.512"] + fn storeups_512(mem_addr: *mut f32, a: f32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.storeu.pd.512"] + fn storeupd_512(mem_addr: *mut f64, a: f64x8, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.store.d.128"] + fn storedqa32_128(mem_addr: *mut i32, a: i32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.q.128"] + fn storedqa64_128(mem_addr: *mut i64, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.ps.128"] + fn storeaps_128(mem_addr: *mut f32, a: f32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.pd.128"] + fn storeapd_128(mem_addr: *mut f64, a: f64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.d.256"] + fn storedqa32_256(mem_addr: *mut i32, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.q.256"] + fn storedqa64_256(mem_addr: *mut i64, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.ps.256"] + fn storeaps_256(mem_addr: *mut f32, a: f32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.pd.256"] + fn storeapd_256(mem_addr: *mut f64, a: f64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.d.512"] + fn storedqa32_512(mem_addr: *mut i32, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.store.q.512"] + fn storedqa64_512(mem_addr: *mut i64, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.store.ps.512"] + fn storeaps_512(mem_addr: *mut f32, a: f32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.store.pd.512"] + fn storeapd_512(mem_addr: *mut f64, a: f64x8, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.expand.load.d.128"] + fn expandloadd_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.expand.load.q.128"] + fn expandloadq_128(mem_addr: *const i64, a: i64x2, mask: u8) -> i64x2; + #[link_name = "llvm.x86.avx512.mask.expand.load.ps.128"] + fn expandloadps_128(mem_addr: *const f32, a: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.expand.load.pd.128"] + fn expandloadpd_128(mem_addr: *const f64, a: f64x2, mask: u8) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.expand.load.d.256"] + fn expandloadd_256(mem_addr: *const i32, a: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.expand.load.q.256"] + fn expandloadq_256(mem_addr: *const i64, a: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx512.mask.expand.load.ps.256"] + fn expandloadps_256(mem_addr: *const f32, a: f32x8, mask: u8) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.expand.load.pd.256"] + fn expandloadpd_256(mem_addr: *const f64, a: f64x4, mask: u8) -> f64x4; + #[link_name = "llvm.x86.avx512.mask.expand.load.d.512"] + fn expandloadd_512(mem_addr: *const i32, a: i32x16, mask: u16) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.expand.load.q.512"] + fn expandloadq_512(mem_addr: *const i64, a: i64x8, mask: u8) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.expand.load.ps.512"] + fn expandloadps_512(mem_addr: *const f32, a: f32x16, mask: u16) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.expand.load.pd.512"] + fn expandloadpd_512(mem_addr: *const f64, a: f64x8, mask: u8) -> f64x8; + } #[cfg(test)] diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs b/library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs index eddbcbe48ab9..da30fd96db1b 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs @@ -1,5 +1,4 @@ use crate::{ - arch::asm, core_arch::{simd::*, x86::*}, intrinsics::simd::*, }; @@ -11,7 +10,7 @@ use stdarch_test::assert_instr; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2")] +#[target_feature(enable = "avx512vbmi2")] #[cfg_attr(test, assert_instr(vpexpandw))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_expandloadu_epi16( @@ -19,41 +18,25 @@ pub unsafe fn _mm512_mask_expandloadu_epi16( k: __mmask32, mem_addr: *const i16, ) -> __m512i { - let mut dst: __m512i = src; - asm!( - vpl!("vpexpandw {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadw_512(mem_addr, src.as_i16x32(), k)) } /// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2")] +#[target_feature(enable = "avx512vbmi2")] #[cfg_attr(test, assert_instr(vpexpandw))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_expandloadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i { - let mut dst: __m512i; - asm!( - vpl!("vpexpandw {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_expandloadu_epi16(_mm512_setzero_si512(), k, mem_addr) } /// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx")] +#[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandw))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_expandloadu_epi16( @@ -61,41 +44,25 @@ pub unsafe fn _mm256_mask_expandloadu_epi16( k: __mmask16, mem_addr: *const i16, ) -> __m256i { - let mut dst: __m256i = src; - asm!( - vpl!("vpexpandw {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadw_256(mem_addr, src.as_i16x16(), k)) } /// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx")] +#[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandw))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_expandloadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i { - let mut dst: __m256i; - asm!( - vpl!("vpexpandw {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_expandloadu_epi16(_mm256_setzero_si256(), k, mem_addr) } /// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx,sse")] +#[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandw))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_expandloadu_epi16( @@ -103,41 +70,25 @@ pub unsafe fn _mm_mask_expandloadu_epi16( k: __mmask8, mem_addr: *const i16, ) -> __m128i { - let mut dst: __m128i = src; - asm!( - vpl!("vpexpandw {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadw_128(mem_addr, src.as_i16x8(), k)) } /// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx,sse")] +#[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandw))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_expandloadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i { - let mut dst: __m128i; - asm!( - vpl!("vpexpandw {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_expandloadu_epi16(_mm_setzero_si128(), k, mem_addr) } /// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2")] +#[target_feature(enable = "avx512vbmi2")] #[cfg_attr(test, assert_instr(vpexpandb))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_mask_expandloadu_epi8( @@ -145,41 +96,25 @@ pub unsafe fn _mm512_mask_expandloadu_epi8( k: __mmask64, mem_addr: *const i8, ) -> __m512i { - let mut dst: __m512i = src; - asm!( - vpl!("vpexpandb {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadb_512(mem_addr, src.as_i8x64(), k)) } /// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2")] +#[target_feature(enable = "avx512vbmi2")] #[cfg_attr(test, assert_instr(vpexpandb))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm512_maskz_expandloadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i { - let mut dst: __m512i; - asm!( - vpl!("vpexpandb {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(zmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm512_mask_expandloadu_epi8(_mm512_setzero_si512(), k, mem_addr) } /// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2,avx512vl,avx")] +#[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandb))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_mask_expandloadu_epi8( @@ -187,41 +122,25 @@ pub unsafe fn _mm256_mask_expandloadu_epi8( k: __mmask32, mem_addr: *const i8, ) -> __m256i { - let mut dst: __m256i = src; - asm!( - vpl!("vpexpandb {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadb_256(mem_addr, src.as_i8x32(), k)) } /// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2,avx512vl,avx")] +#[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandb))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm256_maskz_expandloadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i { - let mut dst: __m256i; - asm!( - vpl!("vpexpandb {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(ymm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm256_mask_expandloadu_epi8(_mm256_setzero_si256(), k, mem_addr) } /// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx,sse")] +#[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandb))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_mask_expandloadu_epi8( @@ -229,34 +148,18 @@ pub unsafe fn _mm_mask_expandloadu_epi8( k: __mmask16, mem_addr: *const i8, ) -> __m128i { - let mut dst: __m128i = src; - asm!( - vpl!("vpexpandb {dst}{{{k}}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = inout(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + transmute(expandloadb_128(mem_addr, src.as_i8x16(), k)) } /// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx,sse")] +#[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpexpandb))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub unsafe fn _mm_maskz_expandloadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i { - let mut dst: __m128i; - asm!( - vpl!("vpexpandb {dst}{{{k}}} {{z}}"), - p = in(reg) mem_addr, - k = in(kreg) k, - dst = out(xmm_reg) dst, - options(pure, readonly, nostack, preserves_flags) - ); - dst + _mm_mask_expandloadu_epi8(_mm_setzero_si128(), k, mem_addr) } /// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -2523,6 +2426,19 @@ extern "C" { fn vpshrdvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16; #[link_name = "llvm.fshr.v8i16"] fn vpshrdvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8; + + #[link_name = "llvm.x86.avx512.mask.expand.load.b.128"] + fn expandloadb_128(mem_addr: *const i8, a: i8x16, mask: u16) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.expand.load.w.128"] + fn expandloadw_128(mem_addr: *const i16, a: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.expand.load.b.256"] + fn expandloadb_256(mem_addr: *const i8, a: i8x32, mask: u32) -> i8x32; + #[link_name = "llvm.x86.avx512.mask.expand.load.w.256"] + fn expandloadw_256(mem_addr: *const i16, a: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx512.mask.expand.load.b.512"] + fn expandloadb_512(mem_addr: *const i8, a: i8x64, mask: u64) -> i8x64; + #[link_name = "llvm.x86.avx512.mask.expand.load.w.512"] + fn expandloadw_512(mem_addr: *const i16, a: i16x32, mask: u32) -> i16x32; } #[cfg(test)]