Fix the stream intrinsics
They should use a platform-specific address management.
This commit is contained in:
parent
1f3264848f
commit
1c3b3b80c0
9 changed files with 82 additions and 59 deletions
|
|
@ -1738,8 +1738,8 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
|
|||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
|
||||
crate::arch::asm!(
|
||||
"vmovntdq [{mem_addr}], {a}",
|
||||
mem_addr = in(reg) mem_addr,
|
||||
vps!("vmovntdq", ",{a}"),
|
||||
p = in(reg) mem_addr,
|
||||
a = in(ymm_reg) a,
|
||||
options(nostack, preserves_flags),
|
||||
);
|
||||
|
|
@ -1766,8 +1766,8 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
|
|||
#[allow(clippy::cast_ptr_alignment)]
|
||||
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
|
||||
crate::arch::asm!(
|
||||
"vmovntpd [{mem_addr}], {a}",
|
||||
mem_addr = in(reg) mem_addr,
|
||||
vps!("vmovntpd", ",{a}"),
|
||||
p = in(reg) mem_addr,
|
||||
a = in(ymm_reg) a,
|
||||
options(nostack, preserves_flags),
|
||||
);
|
||||
|
|
@ -1795,8 +1795,8 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
|
|||
#[allow(clippy::cast_ptr_alignment)]
|
||||
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
|
||||
crate::arch::asm!(
|
||||
"vmovntps [{mem_addr}], {a}",
|
||||
mem_addr = in(reg) mem_addr,
|
||||
vps!("vmovntps", ",{a}"),
|
||||
p = in(reg) mem_addr,
|
||||
a = in(ymm_reg) a,
|
||||
options(nostack, preserves_flags),
|
||||
);
|
||||
|
|
|
|||
|
|
@ -3149,9 +3149,9 @@ pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
|
|||
pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
|
||||
let dst: __m256i;
|
||||
crate::arch::asm!(
|
||||
"vmovntdqa {a}, [{mem_addr}]",
|
||||
vpl!("vmovntdqa {a}"),
|
||||
a = out(ymm_reg) dst,
|
||||
mem_addr = in(reg) mem_addr,
|
||||
p = in(reg) mem_addr,
|
||||
options(pure, readonly, nostack, preserves_flags),
|
||||
);
|
||||
dst
|
||||
|
|
|
|||
|
|
@ -8,8 +8,6 @@ use crate::{
|
|||
#[cfg(test)]
|
||||
use stdarch_test::assert_instr;
|
||||
|
||||
use super::avx512f::{vpl, vps};
|
||||
|
||||
/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi16&expand=30)
|
||||
|
|
|
|||
|
|
@ -6,37 +6,6 @@ use crate::{
|
|||
mem, ptr,
|
||||
};
|
||||
|
||||
// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full
|
||||
// register name (e.g. rax). We have to explicitly override the placeholder to
|
||||
// use the 32-bit register name in that case.
|
||||
|
||||
#[cfg(target_pointer_width = "32")]
|
||||
macro_rules! vpl {
|
||||
($inst:expr) => {
|
||||
concat!($inst, ", [{p:e}]")
|
||||
};
|
||||
}
|
||||
#[cfg(target_pointer_width = "64")]
|
||||
macro_rules! vpl {
|
||||
($inst:expr) => {
|
||||
concat!($inst, ", [{p}]")
|
||||
};
|
||||
}
|
||||
#[cfg(target_pointer_width = "32")]
|
||||
macro_rules! vps {
|
||||
($inst1:expr, $inst2:expr) => {
|
||||
concat!($inst1, " [{p:e}]", $inst2)
|
||||
};
|
||||
}
|
||||
#[cfg(target_pointer_width = "64")]
|
||||
macro_rules! vps {
|
||||
($inst1:expr, $inst2:expr) => {
|
||||
concat!($inst1, " [{p}]", $inst2)
|
||||
};
|
||||
}
|
||||
|
||||
pub(crate) use {vpl, vps};
|
||||
|
||||
#[cfg(test)]
|
||||
use stdarch_test::assert_instr;
|
||||
|
||||
|
|
@ -27899,8 +27868,8 @@ pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) ->
|
|||
#[allow(clippy::cast_ptr_alignment)]
|
||||
pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
|
||||
crate::arch::asm!(
|
||||
"vmovntps [{mem_addr}], {a}",
|
||||
mem_addr = in(reg) mem_addr,
|
||||
vps!("vmovntps", ",{a}"),
|
||||
p = in(reg) mem_addr,
|
||||
a = in(zmm_reg) a,
|
||||
options(nostack, preserves_flags),
|
||||
);
|
||||
|
|
@ -27925,8 +27894,8 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
|
|||
#[allow(clippy::cast_ptr_alignment)]
|
||||
pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
|
||||
crate::arch::asm!(
|
||||
"vmovntpd [{mem_addr}], {a}",
|
||||
mem_addr = in(reg) mem_addr,
|
||||
vps!("vmovntpd", ",{a}"),
|
||||
p = in(reg) mem_addr,
|
||||
a = in(zmm_reg) a,
|
||||
options(nostack, preserves_flags),
|
||||
);
|
||||
|
|
@ -27951,13 +27920,32 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
|
|||
#[allow(clippy::cast_ptr_alignment)]
|
||||
pub unsafe fn _mm512_stream_si512(mem_addr: *mut i32, a: __m512i) {
|
||||
crate::arch::asm!(
|
||||
"vmovntdq [{mem_addr}], {a}",
|
||||
mem_addr = in(reg) mem_addr,
|
||||
vps!("vmovntdq", ",{a}"),
|
||||
p = in(reg) mem_addr,
|
||||
a = in(zmm_reg) a,
|
||||
options(nostack, preserves_flags),
|
||||
);
|
||||
}
|
||||
|
||||
/// Load 512-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
|
||||
/// must be aligned on a 64-byte boundary or a general-protection exception may be generated. To
|
||||
/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon)
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_load_si256)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avx512f")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_stream_load_si512(mem_addr: *const __m512i) -> __m512i {
|
||||
let dst: __m512i;
|
||||
crate::arch::asm!(
|
||||
vpl!("vmovntdqa {a}"),
|
||||
a = out(zmm_reg) dst,
|
||||
p = in(reg) mem_addr,
|
||||
options(pure, readonly, nostack, preserves_flags),
|
||||
);
|
||||
dst
|
||||
}
|
||||
|
||||
/// Sets packed 32-bit integers in `dst` with the supplied values.
|
||||
///
|
||||
/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_ps&expand=4931)
|
||||
|
|
@ -54566,6 +54554,13 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512f")]
|
||||
unsafe fn test_mm512_stream_load_si512() {
|
||||
let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let r = _mm512_stream_load_si512(core::ptr::addr_of!(a) as *const _);
|
||||
assert_eq_m512i(a, r);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512f")]
|
||||
unsafe fn test_mm512_reduce_add_epi32() {
|
||||
let a = _mm512_set1_epi32(1);
|
||||
|
|
|
|||
|
|
@ -57,3 +57,33 @@ macro_rules! assert_approx_eq {
|
|||
);
|
||||
}};
|
||||
}
|
||||
|
||||
// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full
|
||||
// register name (e.g. rax). We have to explicitly override the placeholder to
|
||||
// use the 32-bit register name in that case.
|
||||
|
||||
#[cfg(target_pointer_width = "32")]
|
||||
macro_rules! vpl {
|
||||
($inst:expr) => {
|
||||
concat!($inst, ", [{p:e}]")
|
||||
};
|
||||
}
|
||||
#[cfg(target_pointer_width = "64")]
|
||||
macro_rules! vpl {
|
||||
($inst:expr) => {
|
||||
concat!($inst, ", [{p}]")
|
||||
};
|
||||
}
|
||||
|
||||
#[cfg(target_pointer_width = "32")]
|
||||
macro_rules! vps {
|
||||
($inst1:expr, $inst2:expr) => {
|
||||
concat!($inst1, " [{p:e}]", $inst2)
|
||||
};
|
||||
}
|
||||
#[cfg(target_pointer_width = "64")]
|
||||
macro_rules! vps {
|
||||
($inst1:expr, $inst2:expr) => {
|
||||
concat!($inst1, " [{p}]", $inst2)
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1992,8 +1992,8 @@ extern "C" {
|
|||
#[allow(clippy::cast_ptr_alignment)]
|
||||
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
|
||||
crate::arch::asm!(
|
||||
"movntps [{mem_addr}], {a}",
|
||||
mem_addr = in(reg) mem_addr,
|
||||
vps!("movntps", ",{a}"),
|
||||
p = in(reg) mem_addr,
|
||||
a = in(xmm_reg) a,
|
||||
options(nostack, preserves_flags),
|
||||
);
|
||||
|
|
|
|||
|
|
@ -1312,8 +1312,8 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
|
|||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
|
||||
crate::arch::asm!(
|
||||
"movntdq [{mem_addr}], {a}",
|
||||
mem_addr = in(reg) mem_addr,
|
||||
vps!("movntdq", ",{a}"),
|
||||
p = in(reg) mem_addr,
|
||||
a = in(xmm_reg) a,
|
||||
options(nostack, preserves_flags),
|
||||
);
|
||||
|
|
@ -1339,8 +1339,8 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
|
|||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
|
||||
crate::arch::asm!(
|
||||
"movnti [{mem_addr}], {a:e}", // `:e` for 32bit value
|
||||
mem_addr = in(reg) mem_addr,
|
||||
vps!("movnti", ",{a:e}"), // `:e` for 32bit value
|
||||
p = in(reg) mem_addr,
|
||||
a = in(reg) a,
|
||||
options(nostack, preserves_flags),
|
||||
);
|
||||
|
|
@ -2542,8 +2542,8 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
|
|||
#[allow(clippy::cast_ptr_alignment)]
|
||||
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
|
||||
crate::arch::asm!(
|
||||
"movntpd [{mem_addr}], {a}",
|
||||
mem_addr = in(reg) mem_addr,
|
||||
vps!("movntpd", ",{a}"),
|
||||
p = in(reg) mem_addr,
|
||||
a = in(xmm_reg) a,
|
||||
options(nostack, preserves_flags),
|
||||
);
|
||||
|
|
|
|||
|
|
@ -1154,9 +1154,9 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
|
|||
pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
|
||||
let dst: __m128i;
|
||||
crate::arch::asm!(
|
||||
"movntdqa {a}, [{mem_addr}]",
|
||||
vpl!("movntdqa {a}"),
|
||||
a = out(xmm_reg) dst,
|
||||
mem_addr = in(reg) mem_addr,
|
||||
p = in(reg) mem_addr,
|
||||
options(pure, readonly, nostack, preserves_flags),
|
||||
);
|
||||
dst
|
||||
|
|
|
|||
|
|
@ -79,8 +79,8 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
|
|||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
|
||||
crate::arch::asm!(
|
||||
"movnti [{mem_addr}], {a}",
|
||||
mem_addr = in(reg) mem_addr,
|
||||
"movnti [{p}], {a}",
|
||||
p = in(reg) mem_addr,
|
||||
a = in(reg) a,
|
||||
options(nostack, preserves_flags),
|
||||
);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue