Use LLVM intrinsics for masked load/stores, expand-loads and fp-class
Also, remove some redundant sse target-features from avx intrinsics
This commit is contained in:
parent
ba9e8be05e
commit
aa84427fd4
5 changed files with 479 additions and 1240 deletions
|
|
@ -737,7 +737,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f;
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse2")]
|
||||
#[target_feature(enable = "avx")]
|
||||
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
|
|
@ -767,7 +767,7 @@ pub unsafe fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse")]
|
||||
#[target_feature(enable = "avx")]
|
||||
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
|
|
@ -799,7 +799,7 @@ pub unsafe fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse2")]
|
||||
#[target_feature(enable = "avx")]
|
||||
#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
|
|
@ -816,7 +816,7 @@ pub unsafe fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse")]
|
||||
#[target_feature(enable = "avx")]
|
||||
#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
|
|
@ -1093,7 +1093,7 @@ pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse")]
|
||||
#[target_feature(enable = "avx")]
|
||||
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
|
||||
#[rustc_legacy_const_generics(1)]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
|
|
@ -1163,7 +1163,7 @@ pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse2")]
|
||||
#[target_feature(enable = "avx")]
|
||||
#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
|
||||
#[rustc_legacy_const_generics(1)]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
|
|
@ -2733,7 +2733,7 @@ pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse")]
|
||||
#[target_feature(enable = "avx")]
|
||||
// This intrinsic is only used for compilation and does not generate any
|
||||
// instructions, thus it has zero latency.
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
|
|
@ -2747,7 +2747,7 @@ pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse2")]
|
||||
#[target_feature(enable = "avx")]
|
||||
// This intrinsic is only used for compilation and does not generate any
|
||||
// instructions, thus it has zero latency.
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
|
|
@ -2764,7 +2764,7 @@ pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse2")]
|
||||
#[target_feature(enable = "avx")]
|
||||
// This intrinsic is only used for compilation and does not generate any
|
||||
// instructions, thus it has zero latency.
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
|
|
@ -2888,7 +2888,7 @@ pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse")]
|
||||
#[target_feature(enable = "avx")]
|
||||
// This intrinsic has no corresponding instruction.
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
|
||||
|
|
@ -2903,7 +2903,7 @@ pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m2
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse2")]
|
||||
#[target_feature(enable = "avx")]
|
||||
// This intrinsic has no corresponding instruction.
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
|
||||
|
|
@ -2917,7 +2917,7 @@ pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse2")]
|
||||
#[target_feature(enable = "avx")]
|
||||
// This intrinsic has no corresponding instruction.
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
|
||||
|
|
@ -2932,7 +2932,7 @@ pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse")]
|
||||
#[target_feature(enable = "avx")]
|
||||
// This intrinsic has no corresponding instruction.
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
|
||||
|
|
@ -2949,7 +2949,7 @@ pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256)
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse2")]
|
||||
#[target_feature(enable = "avx")]
|
||||
// This intrinsic has no corresponding instruction.
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
|
||||
|
|
@ -2965,7 +2965,7 @@ pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,sse2")]
|
||||
#[target_feature(enable = "avx")]
|
||||
// This intrinsic has no corresponding instruction.
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
use crate::{
|
||||
arch::asm,
|
||||
core_arch::{simd::*, x86::*},
|
||||
intrinsics::simd::*,
|
||||
ptr,
|
||||
|
|
@ -5396,19 +5395,11 @@ pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) {
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw")]
|
||||
#[target_feature(enable = "avx512bw")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu16))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *const i16) -> __m512i {
|
||||
let mut dst: __m512i = src;
|
||||
asm!(
|
||||
vpl!("vmovdqu16 {dst}{{{k}}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = inout(zmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
transmute(loaddqu16_512(mem_addr, src.as_i16x32(), k))
|
||||
}
|
||||
|
||||
/// Load packed 16-bit integers from memory into dst using zeromask k
|
||||
|
|
@ -5417,19 +5408,11 @@ pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *con
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw")]
|
||||
#[target_feature(enable = "avx512bw")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu16))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i {
|
||||
let mut dst: __m512i;
|
||||
asm!(
|
||||
vpl!("vmovdqu16 {dst}{{{k}}} {{z}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = out(zmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
_mm512_mask_loadu_epi16(_mm512_setzero_si512(), k, mem_addr)
|
||||
}
|
||||
|
||||
/// Load packed 8-bit integers from memory into dst using writemask k
|
||||
|
|
@ -5438,19 +5421,11 @@ pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw")]
|
||||
#[target_feature(enable = "avx512bw")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu8))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *const i8) -> __m512i {
|
||||
let mut dst: __m512i = src;
|
||||
asm!(
|
||||
vpl!("vmovdqu8 {dst}{{{k}}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = inout(zmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
transmute(loaddqu8_512(mem_addr, src.as_i8x64(), k))
|
||||
}
|
||||
|
||||
/// Load packed 8-bit integers from memory into dst using zeromask k
|
||||
|
|
@ -5459,19 +5434,11 @@ pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *cons
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_loadu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw")]
|
||||
#[target_feature(enable = "avx512bw")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu8))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i {
|
||||
let mut dst: __m512i;
|
||||
asm!(
|
||||
vpl!("vmovdqu8 {dst}{{{k}}} {{z}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = out(zmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
_mm512_mask_loadu_epi8(_mm512_setzero_si512(), k, mem_addr)
|
||||
}
|
||||
|
||||
/// Load packed 16-bit integers from memory into dst using writemask k
|
||||
|
|
@ -5480,19 +5447,11 @@ pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m5
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")]
|
||||
#[target_feature(enable = "avx512bw,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu16))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *const i16) -> __m256i {
|
||||
let mut dst: __m256i = src;
|
||||
asm!(
|
||||
vpl!("vmovdqu16 {dst}{{{k}}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = inout(ymm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
transmute(loaddqu16_256(mem_addr, src.as_i16x16(), k))
|
||||
}
|
||||
|
||||
/// Load packed 16-bit integers from memory into dst using zeromask k
|
||||
|
|
@ -5501,19 +5460,11 @@ pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *con
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")]
|
||||
#[target_feature(enable = "avx512bw,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu16))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i {
|
||||
let mut dst: __m256i;
|
||||
asm!(
|
||||
vpl!("vmovdqu16 {dst}{{{k}}} {{z}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = out(ymm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
_mm256_mask_loadu_epi16(_mm256_setzero_si256(), k, mem_addr)
|
||||
}
|
||||
|
||||
/// Load packed 8-bit integers from memory into dst using writemask k
|
||||
|
|
@ -5522,19 +5473,11 @@ pub unsafe fn _mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_loadu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")]
|
||||
#[target_feature(enable = "avx512bw,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu8))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *const i8) -> __m256i {
|
||||
let mut dst: __m256i = src;
|
||||
asm!(
|
||||
vpl!("vmovdqu8 {dst}{{{k}}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = inout(ymm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
transmute(loaddqu8_256(mem_addr, src.as_i8x32(), k))
|
||||
}
|
||||
|
||||
/// Load packed 8-bit integers from memory into dst using zeromask k
|
||||
|
|
@ -5543,19 +5486,11 @@ pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *cons
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_loadu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")]
|
||||
#[target_feature(enable = "avx512bw,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu8))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i {
|
||||
let mut dst: __m256i;
|
||||
asm!(
|
||||
vpl!("vmovdqu8 {dst}{{{k}}} {{z}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = out(ymm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
_mm256_mask_loadu_epi8(_mm256_setzero_si256(), k, mem_addr)
|
||||
}
|
||||
|
||||
/// Load packed 16-bit integers from memory into dst using writemask k
|
||||
|
|
@ -5564,19 +5499,11 @@ pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m2
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")]
|
||||
#[target_feature(enable = "avx512bw,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu16))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i16) -> __m128i {
|
||||
let mut dst: __m128i = src;
|
||||
asm!(
|
||||
vpl!("vmovdqu16 {dst}{{{k}}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = inout(xmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
transmute(loaddqu16_128(mem_addr, src.as_i16x8(), k))
|
||||
}
|
||||
|
||||
/// Load packed 16-bit integers from memory into dst using zeromask k
|
||||
|
|
@ -5585,19 +5512,11 @@ pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")]
|
||||
#[target_feature(enable = "avx512bw,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu16))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i {
|
||||
let mut dst: __m128i;
|
||||
asm!(
|
||||
vpl!("vmovdqu16 {dst}{{{k}}} {{z}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = out(xmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
_mm_mask_loadu_epi16(_mm_setzero_si128(), k, mem_addr)
|
||||
}
|
||||
|
||||
/// Load packed 8-bit integers from memory into dst using writemask k
|
||||
|
|
@ -5606,19 +5525,11 @@ pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_loadu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")]
|
||||
#[target_feature(enable = "avx512bw,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu8))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i8) -> __m128i {
|
||||
let mut dst: __m128i = src;
|
||||
asm!(
|
||||
vpl!("vmovdqu8 {dst}{{{k}}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = inout(xmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
transmute(loaddqu8_128(mem_addr, src.as_i8x16(), k))
|
||||
}
|
||||
|
||||
/// Load packed 8-bit integers from memory into dst using zeromask k
|
||||
|
|
@ -5627,19 +5538,11 @@ pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_loadu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")]
|
||||
#[target_feature(enable = "avx512bw,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu8))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i {
|
||||
let mut dst: __m128i;
|
||||
asm!(
|
||||
vpl!("vmovdqu8 {dst}{{{k}}} {{z}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = out(xmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
_mm_mask_loadu_epi8(_mm_setzero_si128(), k, mem_addr)
|
||||
}
|
||||
|
||||
/// Store packed 16-bit integers from a into memory using writemask k.
|
||||
|
|
@ -5647,17 +5550,11 @@ pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw")]
|
||||
#[target_feature(enable = "avx512bw")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu16))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: __m512i) {
|
||||
asm!(
|
||||
vps!("vmovdqu16", "{{{mask}}}, {a}"),
|
||||
p = in(reg) mem_addr,
|
||||
mask = in(kreg) mask,
|
||||
a = in(zmm_reg) a,
|
||||
options(nostack, preserves_flags)
|
||||
);
|
||||
storedqu16_512(mem_addr, a.as_i16x32(), mask)
|
||||
}
|
||||
|
||||
/// Store packed 8-bit integers from a into memory using writemask k.
|
||||
|
|
@ -5665,17 +5562,11 @@ pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: _
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw")]
|
||||
#[target_feature(enable = "avx512bw")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu8))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m512i) {
|
||||
asm!(
|
||||
vps!("vmovdqu8", "{{{mask}}}, {a}"),
|
||||
p = in(reg) mem_addr,
|
||||
mask = in(kreg) mask,
|
||||
a = in(zmm_reg) a,
|
||||
options(nostack, preserves_flags)
|
||||
);
|
||||
storedqu8_512(mem_addr, a.as_i8x64(), mask)
|
||||
}
|
||||
|
||||
/// Store packed 16-bit integers from a into memory using writemask k.
|
||||
|
|
@ -5683,17 +5574,11 @@ pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")]
|
||||
#[target_feature(enable = "avx512bw,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu16))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: __m256i) {
|
||||
asm!(
|
||||
vps!("vmovdqu16", "{{{mask}}}, {a}"),
|
||||
p = in(reg) mem_addr,
|
||||
mask = in(kreg) mask,
|
||||
a = in(ymm_reg) a,
|
||||
options(nostack, preserves_flags)
|
||||
);
|
||||
storedqu16_256(mem_addr, a.as_i16x16(), mask)
|
||||
}
|
||||
|
||||
/// Store packed 8-bit integers from a into memory using writemask k.
|
||||
|
|
@ -5701,17 +5586,11 @@ pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: _
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_storeu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")]
|
||||
#[target_feature(enable = "avx512bw,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu8))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m256i) {
|
||||
asm!(
|
||||
vps!("vmovdqu8", "{{{mask}}}, {a}"),
|
||||
p = in(reg) mem_addr,
|
||||
mask = in(kreg) mask,
|
||||
a = in(ymm_reg) a,
|
||||
options(nostack, preserves_flags)
|
||||
);
|
||||
storedqu8_256(mem_addr, a.as_i8x32(), mask)
|
||||
}
|
||||
|
||||
/// Store packed 16-bit integers from a into memory using writemask k.
|
||||
|
|
@ -5719,17 +5598,11 @@ pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")]
|
||||
#[target_feature(enable = "avx512bw,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu16))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m128i) {
|
||||
asm!(
|
||||
vps!("vmovdqu16", "{{{mask}}}, {a}"),
|
||||
p = in(reg) mem_addr,
|
||||
mask = in(kreg) mask,
|
||||
a = in(xmm_reg) a,
|
||||
options(nostack, preserves_flags)
|
||||
);
|
||||
storedqu16_128(mem_addr, a.as_i16x8(), mask)
|
||||
}
|
||||
|
||||
/// Store packed 8-bit integers from a into memory using writemask k.
|
||||
|
|
@ -5737,17 +5610,11 @@ pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m12
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_storeu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")]
|
||||
#[target_feature(enable = "avx512bw,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vmovdqu8))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128i) {
|
||||
asm!(
|
||||
vps!("vmovdqu8", "{{{mask}}}, {a}"),
|
||||
p = in(reg) mem_addr,
|
||||
mask = in(kreg) mask,
|
||||
a = in(xmm_reg) a,
|
||||
options(nostack, preserves_flags)
|
||||
);
|
||||
storedqu8_128(mem_addr, a.as_i8x16(), mask)
|
||||
}
|
||||
|
||||
/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst.
|
||||
|
|
@ -11753,6 +11620,33 @@ extern "C" {
|
|||
fn vpmovuswbmem256(mem_addr: *mut i8, a: i16x16, mask: u16);
|
||||
#[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.128"]
|
||||
fn vpmovuswbmem128(mem_addr: *mut i8, a: i16x8, mask: u8);
|
||||
|
||||
#[link_name = "llvm.x86.avx512.mask.loadu.b.128"]
|
||||
fn loaddqu8_128(mem_addr: *const i8, a: i8x16, mask: u16) -> i8x16;
|
||||
#[link_name = "llvm.x86.avx512.mask.loadu.w.128"]
|
||||
fn loaddqu16_128(mem_addr: *const i16, a: i16x8, mask: u8) -> i16x8;
|
||||
#[link_name = "llvm.x86.avx512.mask.loadu.b.256"]
|
||||
fn loaddqu8_256(mem_addr: *const i8, a: i8x32, mask: u32) -> i8x32;
|
||||
#[link_name = "llvm.x86.avx512.mask.loadu.w.256"]
|
||||
fn loaddqu16_256(mem_addr: *const i16, a: i16x16, mask: u16) -> i16x16;
|
||||
#[link_name = "llvm.x86.avx512.mask.loadu.b.512"]
|
||||
fn loaddqu8_512(mem_addr: *const i8, a: i8x64, mask: u64) -> i8x64;
|
||||
#[link_name = "llvm.x86.avx512.mask.loadu.w.512"]
|
||||
fn loaddqu16_512(mem_addr: *const i16, a: i16x32, mask: u32) -> i16x32;
|
||||
|
||||
#[link_name = "llvm.x86.avx512.mask.storeu.b.128"]
|
||||
fn storedqu8_128(mem_addr: *mut i8, a: i8x16, mask: u16);
|
||||
#[link_name = "llvm.x86.avx512.mask.storeu.w.128"]
|
||||
fn storedqu16_128(mem_addr: *mut i16, a: i16x8, mask: u8);
|
||||
#[link_name = "llvm.x86.avx512.mask.storeu.b.256"]
|
||||
fn storedqu8_256(mem_addr: *mut i8, a: i8x32, mask: u32);
|
||||
#[link_name = "llvm.x86.avx512.mask.storeu.w.256"]
|
||||
fn storedqu16_256(mem_addr: *mut i16, a: i16x16, mask: u16);
|
||||
#[link_name = "llvm.x86.avx512.mask.storeu.b.512"]
|
||||
fn storedqu8_512(mem_addr: *mut i8, a: i8x64, mask: u64);
|
||||
#[link_name = "llvm.x86.avx512.mask.storeu.w.512"]
|
||||
fn storedqu16_512(mem_addr: *mut i16, a: i16x32, mask: u32);
|
||||
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
|
|||
|
|
@ -6409,33 +6409,6 @@ pub unsafe fn _mm_maskz_reduce_ss<const IMM8: i32>(k: __mmask8, a: __m128, b: __
|
|||
|
||||
// FP-Class
|
||||
|
||||
// FIXME: Use LLVM intrinsics instead of inline assembly
|
||||
macro_rules! fpclass_asm {
|
||||
($instr:literal, $mask_type: ty, $reg: ident, $a: expr) => {{
|
||||
let dst: $mask_type;
|
||||
$crate::arch::asm!(
|
||||
concat!($instr, " {k}, {src}, {imm8}"),
|
||||
k = lateout(kreg) dst,
|
||||
src = in($reg) $a,
|
||||
imm8 = const IMM8,
|
||||
options(pure, nomem, nostack)
|
||||
);
|
||||
dst
|
||||
}};
|
||||
($instr:literal, $mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{
|
||||
let dst: $mask_type;
|
||||
$crate::arch::asm!(
|
||||
concat!($instr, " {k} {{ {mask} }}, {src}, {imm8}"),
|
||||
k = lateout(kreg) dst,
|
||||
mask = in(kreg) $mask,
|
||||
src = in($reg) $a,
|
||||
imm8 = const IMM8,
|
||||
options(pure, nomem, nostack)
|
||||
);
|
||||
dst
|
||||
}};
|
||||
}
|
||||
|
||||
/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified
|
||||
/// by imm8, and store the results in mask vector k.
|
||||
/// imm can be a combination of:
|
||||
|
|
@ -6451,13 +6424,13 @@ macro_rules! fpclass_asm {
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_pd_mask&ig_expand=3493)
|
||||
#[inline]
|
||||
#[target_feature(enable = "sse,avx512f,avx512dq,avx512vl")]
|
||||
#[target_feature(enable = "avx512dq,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(1)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_fpclass_pd_mask<const IMM8: i32>(a: __m128d) -> __mmask8 {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
fpclass_asm!("vfpclasspd", __mmask8, xmm_reg, a)
|
||||
_mm_mask_fpclass_pd_mask::<IMM8>(0xff, a)
|
||||
}
|
||||
|
||||
/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified
|
||||
|
|
@ -6476,13 +6449,13 @@ pub unsafe fn _mm_fpclass_pd_mask<const IMM8: i32>(a: __m128d) -> __mmask8 {
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_pd_mask&ig_expand=3494)
|
||||
#[inline]
|
||||
#[target_feature(enable = "sse,avx512f,avx512dq,avx512vl")]
|
||||
#[target_feature(enable = "avx512dq,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_mask_fpclass_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m128d) -> __mmask8 {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
fpclass_asm!("vfpclasspd", __mmask8, k1, xmm_reg, a)
|
||||
transmute(vfpclasspd_128(a.as_f64x2(), IMM8, k1))
|
||||
}
|
||||
|
||||
/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified
|
||||
|
|
@ -6500,13 +6473,13 @@ pub unsafe fn _mm_mask_fpclass_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m128d
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fpclass_pd_mask&ig_expand=3495)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,avx512f,avx512dq,avx512vl")]
|
||||
#[target_feature(enable = "avx512dq,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(1)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_fpclass_pd_mask<const IMM8: i32>(a: __m256d) -> __mmask8 {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
fpclass_asm!("vfpclasspd", __mmask8, ymm_reg, a)
|
||||
_mm256_mask_fpclass_pd_mask::<IMM8>(0xff, a)
|
||||
}
|
||||
|
||||
/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified
|
||||
|
|
@ -6525,13 +6498,13 @@ pub unsafe fn _mm256_fpclass_pd_mask<const IMM8: i32>(a: __m256d) -> __mmask8 {
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fpclass_pd_mask&ig_expand=3496)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,avx512f,avx512dq,avx512vl")]
|
||||
#[target_feature(enable = "avx512dq,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_mask_fpclass_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m256d) -> __mmask8 {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
fpclass_asm!("vfpclasspd", __mmask8, k1, ymm_reg, a)
|
||||
transmute(vfpclasspd_256(a.as_f64x4(), IMM8, k1))
|
||||
}
|
||||
|
||||
/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified
|
||||
|
|
@ -6549,13 +6522,13 @@ pub unsafe fn _mm256_mask_fpclass_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m2
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fpclass_pd_mask&ig_expand=3497)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512dq")]
|
||||
#[target_feature(enable = "avx512dq")]
|
||||
#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(1)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_fpclass_pd_mask<const IMM8: i32>(a: __m512d) -> __mmask8 {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
fpclass_asm!("vfpclasspd", __mmask8, zmm_reg, a)
|
||||
_mm512_mask_fpclass_pd_mask::<IMM8>(0xff, a)
|
||||
}
|
||||
|
||||
/// Test packed double-precision (64-bit) floating-point elements in a for special categories specified
|
||||
|
|
@ -6574,13 +6547,13 @@ pub unsafe fn _mm512_fpclass_pd_mask<const IMM8: i32>(a: __m512d) -> __mmask8 {
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fpclass_pd_mask&ig_expand=3498)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512dq")]
|
||||
#[target_feature(enable = "avx512dq")]
|
||||
#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_mask_fpclass_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m512d) -> __mmask8 {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
fpclass_asm!("vfpclasspd", __mmask8, k1, zmm_reg, a)
|
||||
transmute(vfpclasspd_512(a.as_f64x8(), IMM8, k1))
|
||||
}
|
||||
|
||||
/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified
|
||||
|
|
@ -6598,13 +6571,13 @@ pub unsafe fn _mm512_mask_fpclass_pd_mask<const IMM8: i32>(k1: __mmask8, a: __m5
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_ps_mask&ig_expand=3505)
|
||||
#[inline]
|
||||
#[target_feature(enable = "sse,avx512f,avx512dq,avx512vl")]
|
||||
#[target_feature(enable = "avx512dq,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(1)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_fpclass_ps_mask<const IMM8: i32>(a: __m128) -> __mmask8 {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
fpclass_asm!("vfpclassps", __mmask8, xmm_reg, a)
|
||||
_mm_mask_fpclass_ps_mask::<IMM8>(0xff, a)
|
||||
}
|
||||
|
||||
/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified
|
||||
|
|
@ -6623,13 +6596,13 @@ pub unsafe fn _mm_fpclass_ps_mask<const IMM8: i32>(a: __m128) -> __mmask8 {
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_ps_mask&ig_expand=3506)
|
||||
#[inline]
|
||||
#[target_feature(enable = "sse,avx512f,avx512dq,avx512vl")]
|
||||
#[target_feature(enable = "avx512dq,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_mask_fpclass_ps_mask<const IMM8: i32>(k1: __mmask8, a: __m128) -> __mmask8 {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
fpclass_asm!("vfpclassps", __mmask8, k1, xmm_reg, a)
|
||||
transmute(vfpclassps_128(a.as_f32x4(), IMM8, k1))
|
||||
}
|
||||
|
||||
/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified
|
||||
|
|
@ -6647,13 +6620,13 @@ pub unsafe fn _mm_mask_fpclass_ps_mask<const IMM8: i32>(k1: __mmask8, a: __m128)
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fpclass_ps_mask&ig_expand=3507)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,avx512f,avx512dq,avx512vl")]
|
||||
#[target_feature(enable = "avx512dq,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(1)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_fpclass_ps_mask<const IMM8: i32>(a: __m256) -> __mmask8 {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
fpclass_asm!("vfpclassps", __mmask8, ymm_reg, a)
|
||||
_mm256_mask_fpclass_ps_mask::<IMM8>(0xff, a)
|
||||
}
|
||||
|
||||
/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified
|
||||
|
|
@ -6672,13 +6645,13 @@ pub unsafe fn _mm256_fpclass_ps_mask<const IMM8: i32>(a: __m256) -> __mmask8 {
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_fpclass_ps_mask&ig_expand=3508)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx,avx512f,avx512dq,avx512vl")]
|
||||
#[target_feature(enable = "avx512dq,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_mask_fpclass_ps_mask<const IMM8: i32>(k1: __mmask8, a: __m256) -> __mmask8 {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
fpclass_asm!("vfpclassps", __mmask8, k1, ymm_reg, a)
|
||||
transmute(vfpclassps_256(a.as_f32x8(), IMM8, k1))
|
||||
}
|
||||
|
||||
/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified
|
||||
|
|
@ -6696,13 +6669,13 @@ pub unsafe fn _mm256_mask_fpclass_ps_mask<const IMM8: i32>(k1: __mmask8, a: __m2
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_fpclass_ps_mask&ig_expand=3509)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512dq")]
|
||||
#[target_feature(enable = "avx512dq")]
|
||||
#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(1)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_fpclass_ps_mask<const IMM8: i32>(a: __m512) -> __mmask16 {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
fpclass_asm!("vfpclassps", __mmask16, zmm_reg, a)
|
||||
_mm512_mask_fpclass_ps_mask::<IMM8>(0xffff, a)
|
||||
}
|
||||
|
||||
/// Test packed single-precision (32-bit) floating-point elements in a for special categories specified
|
||||
|
|
@ -6721,13 +6694,13 @@ pub unsafe fn _mm512_fpclass_ps_mask<const IMM8: i32>(a: __m512) -> __mmask16 {
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_fpclass_ps_mask&ig_expand=3510)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512dq")]
|
||||
#[target_feature(enable = "avx512dq")]
|
||||
#[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_mask_fpclass_ps_mask<const IMM8: i32>(k1: __mmask16, a: __m512) -> __mmask16 {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
fpclass_asm!("vfpclassps", __mmask16, k1, zmm_reg, a)
|
||||
transmute(vfpclassps_512(a.as_f32x16(), IMM8, k1))
|
||||
}
|
||||
|
||||
/// Test the lower double-precision (64-bit) floating-point element in a for special categories specified
|
||||
|
|
@ -6745,7 +6718,7 @@ pub unsafe fn _mm512_mask_fpclass_ps_mask<const IMM8: i32>(k1: __mmask16, a: __m
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_sd_mask&ig_expand=3511)
|
||||
#[inline]
|
||||
#[target_feature(enable = "sse,avx512f,avx512dq")]
|
||||
#[target_feature(enable = "avx512dq")]
|
||||
#[cfg_attr(test, assert_instr(vfpclasssd, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(1)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
|
|
@ -6770,7 +6743,7 @@ pub unsafe fn _mm_fpclass_sd_mask<const IMM8: i32>(a: __m128d) -> __mmask8 {
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_sd_mask&ig_expand=3512)
|
||||
#[inline]
|
||||
#[target_feature(enable = "sse,avx512f,avx512dq")]
|
||||
#[target_feature(enable = "avx512dq")]
|
||||
#[cfg_attr(test, assert_instr(vfpclasssd, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
|
|
@ -6794,7 +6767,7 @@ pub unsafe fn _mm_mask_fpclass_sd_mask<const IMM8: i32>(k1: __mmask8, a: __m128d
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fpclass_ss_mask&ig_expand=3515)
|
||||
#[inline]
|
||||
#[target_feature(enable = "sse,avx512f,avx512dq")]
|
||||
#[target_feature(enable = "avx512dq")]
|
||||
#[cfg_attr(test, assert_instr(vfpclassss, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(1)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
|
|
@ -6819,7 +6792,7 @@ pub unsafe fn _mm_fpclass_ss_mask<const IMM8: i32>(a: __m128) -> __mmask8 {
|
|||
///
|
||||
/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_fpclass_ss_mask&ig_expand=3516)
|
||||
#[inline]
|
||||
#[target_feature(enable = "sse,avx512f,avx512dq")]
|
||||
#[target_feature(enable = "avx512dq")]
|
||||
#[cfg_attr(test, assert_instr(vfpclassss, IMM8 = 0))]
|
||||
#[rustc_legacy_const_generics(2)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
|
|
@ -6953,6 +6926,20 @@ extern "C" {
|
|||
#[link_name = "llvm.x86.avx512.mask.reduce.ss"]
|
||||
fn vreducess(a: f32x4, b: f32x4, src: f32x4, k: __mmask8, imm8: i32, sae: i32) -> f32x4;
|
||||
|
||||
#[link_name = "llvm.x86.avx512.mask.fpclass.pd.128"]
|
||||
fn vfpclasspd_128(a: f64x2, imm8: i32, k: __mmask8) -> __mmask8;
|
||||
#[link_name = "llvm.x86.avx512.mask.fpclass.pd.256"]
|
||||
fn vfpclasspd_256(a: f64x4, imm8: i32, k: __mmask8) -> __mmask8;
|
||||
#[link_name = "llvm.x86.avx512.mask.fpclass.pd.512"]
|
||||
fn vfpclasspd_512(a: f64x8, imm8: i32, k: __mmask8) -> __mmask8;
|
||||
|
||||
#[link_name = "llvm.x86.avx512.mask.fpclass.ps.128"]
|
||||
fn vfpclassps_128(a: f32x4, imm8: i32, k: __mmask8) -> __mmask8;
|
||||
#[link_name = "llvm.x86.avx512.mask.fpclass.ps.256"]
|
||||
fn vfpclassps_256(a: f32x8, imm8: i32, k: __mmask8) -> __mmask8;
|
||||
#[link_name = "llvm.x86.avx512.mask.fpclass.ps.512"]
|
||||
fn vfpclassps_512(a: f32x16, imm8: i32, k: __mmask16) -> __mmask16;
|
||||
|
||||
#[link_name = "llvm.x86.avx512.mask.fpclass.sd"]
|
||||
fn vfpclasssd(a: f64x2, imm8: i32, k: __mmask8) -> __mmask8;
|
||||
#[link_name = "llvm.x86.avx512.mask.fpclass.ss"]
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -1,5 +1,4 @@
|
|||
use crate::{
|
||||
arch::asm,
|
||||
core_arch::{simd::*, x86::*},
|
||||
intrinsics::simd::*,
|
||||
};
|
||||
|
|
@ -11,7 +10,7 @@ use stdarch_test::assert_instr;
|
|||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2")]
|
||||
#[target_feature(enable = "avx512vbmi2")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandw))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_mask_expandloadu_epi16(
|
||||
|
|
@ -19,41 +18,25 @@ pub unsafe fn _mm512_mask_expandloadu_epi16(
|
|||
k: __mmask32,
|
||||
mem_addr: *const i16,
|
||||
) -> __m512i {
|
||||
let mut dst: __m512i = src;
|
||||
asm!(
|
||||
vpl!("vpexpandw {dst}{{{k}}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = inout(zmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
transmute(expandloadw_512(mem_addr, src.as_i16x32(), k))
|
||||
}
|
||||
|
||||
/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2")]
|
||||
#[target_feature(enable = "avx512vbmi2")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandw))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_expandloadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i {
|
||||
let mut dst: __m512i;
|
||||
asm!(
|
||||
vpl!("vpexpandw {dst}{{{k}}} {{z}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = out(zmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
_mm512_mask_expandloadu_epi16(_mm512_setzero_si512(), k, mem_addr)
|
||||
}
|
||||
|
||||
/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx")]
|
||||
#[target_feature(enable = "avx512vbmi2,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandw))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_mask_expandloadu_epi16(
|
||||
|
|
@ -61,41 +44,25 @@ pub unsafe fn _mm256_mask_expandloadu_epi16(
|
|||
k: __mmask16,
|
||||
mem_addr: *const i16,
|
||||
) -> __m256i {
|
||||
let mut dst: __m256i = src;
|
||||
asm!(
|
||||
vpl!("vpexpandw {dst}{{{k}}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = inout(ymm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
transmute(expandloadw_256(mem_addr, src.as_i16x16(), k))
|
||||
}
|
||||
|
||||
/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx")]
|
||||
#[target_feature(enable = "avx512vbmi2,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandw))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_expandloadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i {
|
||||
let mut dst: __m256i;
|
||||
asm!(
|
||||
vpl!("vpexpandw {dst}{{{k}}} {{z}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = out(ymm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
_mm256_mask_expandloadu_epi16(_mm256_setzero_si256(), k, mem_addr)
|
||||
}
|
||||
|
||||
/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx,sse")]
|
||||
#[target_feature(enable = "avx512vbmi2,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandw))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_mask_expandloadu_epi16(
|
||||
|
|
@ -103,41 +70,25 @@ pub unsafe fn _mm_mask_expandloadu_epi16(
|
|||
k: __mmask8,
|
||||
mem_addr: *const i16,
|
||||
) -> __m128i {
|
||||
let mut dst: __m128i = src;
|
||||
asm!(
|
||||
vpl!("vpexpandw {dst}{{{k}}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = inout(xmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
transmute(expandloadw_128(mem_addr, src.as_i16x8(), k))
|
||||
}
|
||||
|
||||
/// Load contiguous active 16-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi16)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx,sse")]
|
||||
#[target_feature(enable = "avx512vbmi2,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandw))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_expandloadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i {
|
||||
let mut dst: __m128i;
|
||||
asm!(
|
||||
vpl!("vpexpandw {dst}{{{k}}} {{z}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = out(xmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
_mm_mask_expandloadu_epi16(_mm_setzero_si128(), k, mem_addr)
|
||||
}
|
||||
|
||||
/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2")]
|
||||
#[target_feature(enable = "avx512vbmi2")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandb))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_mask_expandloadu_epi8(
|
||||
|
|
@ -145,41 +96,25 @@ pub unsafe fn _mm512_mask_expandloadu_epi8(
|
|||
k: __mmask64,
|
||||
mem_addr: *const i8,
|
||||
) -> __m512i {
|
||||
let mut dst: __m512i = src;
|
||||
asm!(
|
||||
vpl!("vpexpandb {dst}{{{k}}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = inout(zmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
transmute(expandloadb_512(mem_addr, src.as_i8x64(), k))
|
||||
}
|
||||
|
||||
/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_expandloadu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2")]
|
||||
#[target_feature(enable = "avx512vbmi2")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandb))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_expandloadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i {
|
||||
let mut dst: __m512i;
|
||||
asm!(
|
||||
vpl!("vpexpandb {dst}{{{k}}} {{z}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = out(zmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
_mm512_mask_expandloadu_epi8(_mm512_setzero_si512(), k, mem_addr)
|
||||
}
|
||||
|
||||
/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_expandloadu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2,avx512vl,avx")]
|
||||
#[target_feature(enable = "avx512vbmi2,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandb))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_mask_expandloadu_epi8(
|
||||
|
|
@ -187,41 +122,25 @@ pub unsafe fn _mm256_mask_expandloadu_epi8(
|
|||
k: __mmask32,
|
||||
mem_addr: *const i8,
|
||||
) -> __m256i {
|
||||
let mut dst: __m256i = src;
|
||||
asm!(
|
||||
vpl!("vpexpandb {dst}{{{k}}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = inout(ymm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
transmute(expandloadb_256(mem_addr, src.as_i8x32(), k))
|
||||
}
|
||||
|
||||
/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_expandloadu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi2,avx512vl,avx")]
|
||||
#[target_feature(enable = "avx512vbmi2,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandb))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_expandloadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i {
|
||||
let mut dst: __m256i;
|
||||
asm!(
|
||||
vpl!("vpexpandb {dst}{{{k}}} {{z}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = out(ymm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
_mm256_mask_expandloadu_epi8(_mm256_setzero_si256(), k, mem_addr)
|
||||
}
|
||||
|
||||
/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_expandloadu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx,sse")]
|
||||
#[target_feature(enable = "avx512vbmi2,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandb))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_mask_expandloadu_epi8(
|
||||
|
|
@ -229,34 +148,18 @@ pub unsafe fn _mm_mask_expandloadu_epi8(
|
|||
k: __mmask16,
|
||||
mem_addr: *const i8,
|
||||
) -> __m128i {
|
||||
let mut dst: __m128i = src;
|
||||
asm!(
|
||||
vpl!("vpexpandb {dst}{{{k}}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = inout(xmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
transmute(expandloadb_128(mem_addr, src.as_i8x16(), k))
|
||||
}
|
||||
|
||||
/// Load contiguous active 8-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_expandloadu_epi8)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f,avx512vbmi2,avx512vl,avx,sse")]
|
||||
#[target_feature(enable = "avx512vbmi2,avx512vl")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandb))]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_expandloadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i {
|
||||
let mut dst: __m128i;
|
||||
asm!(
|
||||
vpl!("vpexpandb {dst}{{{k}}} {{z}}"),
|
||||
p = in(reg) mem_addr,
|
||||
k = in(kreg) k,
|
||||
dst = out(xmm_reg) dst,
|
||||
options(pure, readonly, nostack, preserves_flags)
|
||||
);
|
||||
dst
|
||||
_mm_mask_expandloadu_epi8(_mm_setzero_si128(), k, mem_addr)
|
||||
}
|
||||
|
||||
/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
|
||||
|
|
@ -2523,6 +2426,19 @@ extern "C" {
|
|||
fn vpshrdvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16;
|
||||
#[link_name = "llvm.fshr.v8i16"]
|
||||
fn vpshrdvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8;
|
||||
|
||||
#[link_name = "llvm.x86.avx512.mask.expand.load.b.128"]
|
||||
fn expandloadb_128(mem_addr: *const i8, a: i8x16, mask: u16) -> i8x16;
|
||||
#[link_name = "llvm.x86.avx512.mask.expand.load.w.128"]
|
||||
fn expandloadw_128(mem_addr: *const i16, a: i16x8, mask: u8) -> i16x8;
|
||||
#[link_name = "llvm.x86.avx512.mask.expand.load.b.256"]
|
||||
fn expandloadb_256(mem_addr: *const i8, a: i8x32, mask: u32) -> i8x32;
|
||||
#[link_name = "llvm.x86.avx512.mask.expand.load.w.256"]
|
||||
fn expandloadw_256(mem_addr: *const i16, a: i16x16, mask: u16) -> i16x16;
|
||||
#[link_name = "llvm.x86.avx512.mask.expand.load.b.512"]
|
||||
fn expandloadb_512(mem_addr: *const i8, a: i8x64, mask: u64) -> i8x64;
|
||||
#[link_name = "llvm.x86.avx512.mask.expand.load.w.512"]
|
||||
fn expandloadw_512(mem_addr: *const i16, a: i16x32, mask: u32) -> i16x32;
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue