Implement avx512 compressstore intrinsics (#1273)
parent 7c81ce0606
commit 929f6e8409

2 changed files with 292 additions and 12 deletions
@@ -1629,18 +1629,18 @@
 * [x] [`_mm_maskz_compress_pd`]
 * [x] [`_mm256_mask_compress_pd`]
 * [x] [`_mm256_maskz_compress_pd`]
-* [ ] [`_mm512_mask_compressstoreu_epi32`] //need i1
-* [ ] [`_mm_mask_compressstoreu_epi32`] //need i1
-* [ ] [`_mm256_mask_compressstoreu_epi32`] //need i1
-* [ ] [`_mm512_mask_compressstoreu_epi64`] //need i1
-* [ ] [`_mm_mask_compressstoreu_epi64`] //need i1
-* [ ] [`_mm256_mask_compressstoreu_epi64`] //need i1
-* [ ] [`_mm512_mask_compressstoreu_ps`] //need i1
-* [ ] [`_mm_mask_compressstoreu_ps`] //need i1
-* [ ] [`_mm256_mask_compressstoreu_ps`] //need i1
-* [ ] [`_mm512_mask_compressstoreu_pd`] //need i1
-* [ ] [`_mm_mask_compressstoreu_pd`] //need i1
-* [ ] [`_mm256_mask_compressstoreu_pd`] //need i1
+* [x] [`_mm512_mask_compressstoreu_epi32`] //need i1
+* [x] [`_mm_mask_compressstoreu_epi32`] //need i1
+* [x] [`_mm256_mask_compressstoreu_epi32`] //need i1
+* [x] [`_mm512_mask_compressstoreu_epi64`] //need i1
+* [x] [`_mm_mask_compressstoreu_epi64`] //need i1
+* [x] [`_mm256_mask_compressstoreu_epi64`] //need i1
+* [x] [`_mm512_mask_compressstoreu_ps`] //need i1
+* [x] [`_mm_mask_compressstoreu_ps`] //need i1
+* [x] [`_mm256_mask_compressstoreu_ps`] //need i1
+* [x] [`_mm512_mask_compressstoreu_pd`] //need i1
+* [x] [`_mm_mask_compressstoreu_pd`] //need i1
+* [x] [`_mm256_mask_compressstoreu_pd`] //need i1
 * [x] [`_mm512_mask_expand_epi32`]
 * [x] [`_mm512_maskz_expand_epi32`]
 * [x] [`_mm_mask_expand_epi32`]
@@ -16156,6 +16156,126 @@ pub unsafe fn _mm_maskz_compress_pd(k: __mmask8, a: __m128d) -> __m128d {
    transmute(vcompresspd128(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), k))
}

/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm512_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask16, a: __m512i) {
    vcompressstored(base_addr as *mut _, a.as_i32x16(), k)
}

/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm256_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m256i) {
    vcompressstored256(base_addr as *mut _, a.as_i32x8(), k)
}

/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m128i) {
    vcompressstored128(base_addr as *mut _, a.as_i32x4(), k)
}

/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm512_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m512i) {
    vcompressstoreq(base_addr as *mut _, a.as_i64x8(), k)
}

/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm256_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m256i) {
    vcompressstoreq256(base_addr as *mut _, a.as_i64x4(), k)
}

/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m128i) {
    vcompressstoreq128(base_addr as *mut _, a.as_i64x2(), k)
}

/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm512_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask16, a: __m512) {
    vcompressstoreps(base_addr as *mut _, a.as_f32x16(), k)
}

/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm256_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m256) {
    vcompressstoreps256(base_addr as *mut _, a.as_f32x8(), k)
}

/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m128) {
    vcompressstoreps128(base_addr as *mut _, a.as_f32x4(), k)
}

/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm512_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m512d) {
    vcompressstorepd(base_addr as *mut _, a.as_f64x8(), k)
}

/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm256_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m256d) {
    vcompressstorepd256(base_addr as *mut _, a.as_f64x4(), k)
}

/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m128d) {
    vcompressstorepd128(base_addr as *mut _, a.as_f64x2(), k)
}

/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi32&expand=2316)
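The compress-store semantics are easier to see in a scalar model. Below is a minimal, illustrative Rust sketch (an editorial aid, not code from this commit; `compressstoreu_model` is a hypothetical name) of what a 16-lane masked compress-store does, assuming the writemask is consumed least-significant bit first:

// Scalar model of a 16-lane masked compress-store (illustrative only):
// active elements are packed contiguously to the front of `dst`; slots of
// `dst` beyond the number of set mask bits are left untouched.
fn compressstoreu_model(dst: &mut [i32], k: u16, a: [i32; 16]) {
    let mut out = 0;
    for (i, v) in a.iter().enumerate() {
        // Bit i of the mask decides whether lane i is active.
        if (k >> i) & 1 == 1 {
            dst[out] = *v;
            out += 1;
        }
    }
}

The tests added later in this commit rely on exactly this behavior: the selected elements land at the front of the output buffer, and the remaining slots keep their initial zeros.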
@@ -38007,6 +38127,34 @@ extern "C" {
    #[link_name = "llvm.x86.avx512.mask.compress.pd.128"]
    fn vcompresspd128(a: f64x2, src: f64x2, mask: u8) -> f64x2;

    #[link_name = "llvm.x86.avx512.mask.compress.store.d.512"]
    fn vcompressstored(mem: *mut i8, data: i32x16, mask: u16);
    #[link_name = "llvm.x86.avx512.mask.compress.store.d.256"]
    fn vcompressstored256(mem: *mut i8, data: i32x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.d.128"]
    fn vcompressstored128(mem: *mut i8, data: i32x4, mask: u8);

    #[link_name = "llvm.x86.avx512.mask.compress.store.q.512"]
    fn vcompressstoreq(mem: *mut i8, data: i64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.q.256"]
    fn vcompressstoreq256(mem: *mut i8, data: i64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.q.128"]
    fn vcompressstoreq128(mem: *mut i8, data: i64x2, mask: u8);

    #[link_name = "llvm.x86.avx512.mask.compress.store.ps.512"]
    fn vcompressstoreps(mem: *mut i8, data: f32x16, mask: u16);
    #[link_name = "llvm.x86.avx512.mask.compress.store.ps.256"]
    fn vcompressstoreps256(mem: *mut i8, data: f32x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.ps.128"]
    fn vcompressstoreps128(mem: *mut i8, data: f32x4, mask: u8);

    #[link_name = "llvm.x86.avx512.mask.compress.store.pd.512"]
    fn vcompressstorepd(mem: *mut i8, data: f64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.pd.256"]
    fn vcompressstorepd256(mem: *mut i8, data: f64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.pd.128"]
    fn vcompressstorepd128(mem: *mut i8, data: f64x2, mask: u8);

    #[link_name = "llvm.x86.avx512.mask.expand.d.512"]
    fn vpexpandd(a: i32x16, src: i32x16, mask: u16) -> i32x16;
    #[link_name = "llvm.x86.avx512.mask.expand.d.256"]
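As a usage sketch for the new public intrinsics (an editorial illustration, not part of the commit): compress-store is a natural building block for stream compaction, i.e. packing the lanes selected by a comparison mask to the front of a buffer. `keep_nonnegative` below is a hypothetical helper built from intrinsics that exist in stdarch; in real code the call would be gated behind `is_x86_feature_detected!("avx512f")`.

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

// Hypothetical helper (not from this commit): pack the non-negative lanes
// of `a` to the front of `out` with a single masked compress-store.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn keep_nonnegative(a: __m512i, out: &mut [i32; 16]) {
    // One mask bit per lane satisfying a >= 0.
    let k: __mmask16 = _mm512_cmpge_epi32_mask(a, _mm512_setzero_si512());
    // Store only the selected lanes, contiguously, at the start of `out`.
    _mm512_mask_compressstoreu_epi32(out.as_mut_ptr() as *mut _, k, a);
}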
@@ -51357,6 +51505,138 @@ mod tests {
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_compressstoreu_epi32() {
        let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let mut r = [0_i32; 16];
        _mm512_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_i32; 16]);
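        // The writemask is consumed LSB-first: 0b1111000011001010 has bits
        // 1, 3, 6, 7, 12, 13, 14, 15 set, selecting the elements with values
        // 2, 4, 7, 8, 13, 14, 15, 16.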
        _mm512_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b1111000011001010, a);
        assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]);
    }
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_compressstoreu_epi32() {
        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let mut r = [0_i32; 8];
        _mm256_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_i32; 8]);
        _mm256_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b11001010, a);
        assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_compressstoreu_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut r = [0_i32; 4];
        _mm_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_i32; 4]);
        _mm_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b1011, a);
        assert_eq!(&r, &[1, 2, 4, 0]);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_compressstoreu_epi64() {
        let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let mut r = [0_i64; 8];
        _mm512_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_i64; 8]);
        _mm512_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b11001010, a);
        assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_compressstoreu_epi64() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let mut r = [0_i64; 4];
        _mm256_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_i64; 4]);
        _mm256_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b1011, a);
        assert_eq!(&r, &[1, 2, 4, 0]);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_compressstoreu_epi64() {
        let a = _mm_setr_epi64x(1, 2);
        let mut r = [0_i64; 2];
        _mm_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_i64; 2]);
        _mm_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b10, a);
        assert_eq!(&r, &[2, 0]);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_compressstoreu_ps() {
        let a = _mm512_setr_ps(
            1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32, 9_f32, 10_f32, 11_f32, 12_f32,
            13_f32, 14_f32, 15_f32, 16_f32,
        );
        let mut r = [0_f32; 16];
        _mm512_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_f32; 16]);
        _mm512_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b1111000011001010, a);
        assert_eq!(
            &r,
            &[
                2_f32, 4_f32, 7_f32, 8_f32, 13_f32, 14_f32, 15_f32, 16_f32, 0_f32, 0_f32, 0_f32,
                0_f32, 0_f32, 0_f32, 0_f32, 0_f32
            ]
        );
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_compressstoreu_ps() {
        let a = _mm256_setr_ps(1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32);
        let mut r = [0_f32; 8];
        _mm256_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0_f32; 8]);
        _mm256_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b11001010, a);
        assert_eq!(
            &r,
            &[2_f32, 4_f32, 7_f32, 8_f32, 0_f32, 0_f32, 0_f32, 0_f32]
        );
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_compressstoreu_ps() {
        let a = _mm_setr_ps(1_f32, 2_f32, 3_f32, 4_f32);
        let mut r = [0.; 4];
        _mm_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0.; 4]);
        _mm_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b1011, a);
        assert_eq!(&r, &[1_f32, 2_f32, 4_f32, 0_f32]);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_compressstoreu_pd() {
        let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
        let mut r = [0.; 8];
        _mm512_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0.; 8]);
        _mm512_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b11001010, a);
        assert_eq!(&r, &[2., 4., 7., 8., 0., 0., 0., 0.]);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_compressstoreu_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let mut r = [0.; 4];
        _mm256_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0.; 4]);
        _mm256_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b1011, a);
        assert_eq!(&r, &[1., 2., 4., 0.]);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_compressstoreu_pd() {
        let a = _mm_setr_pd(1., 2.);
        let mut r = [0.; 2];
        _mm_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
        assert_eq!(&r, &[0.; 2]);
        _mm_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b10, a);
        assert_eq!(&r, &[2., 0.]);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_expand_epi32() {
        let src = _mm512_set1_epi32(200);