Add 256-bit vectors and some SSE intrinsics.
This commit is contained in:
parent
df03dc4d80
commit
27e307278a
9 changed files with 687 additions and 69 deletions
|
|
@ -1,3 +1,167 @@
|
|||
**TIP**: Use the following command to generate a section in this list for
|
||||
Intel intrinsics. Replace `SSE4.2` with the name of the instruction set you want to list (for example `SSE3`).
|
||||
|
||||
```
|
||||
rg '^<intrinsic' intel-intrinsics-3.3.15.xml | rg "'SSE4.2'" | rg '^.*name=\x27([^\x27]+)\x27.*$' -r '* [ ] `$1`' >> TODO.md
|
||||
```
|
||||
|
||||
|
||||
sse
|
||||
---
|
||||
* [ ] `_MM_TRANSPOSE4_PS`
|
||||
* [ ] `_mm_getcsr`
|
||||
* [ ] `_mm_setcsr`
|
||||
* [ ] `_MM_GET_EXCEPTION_STATE`
|
||||
* [ ] `_MM_SET_EXCEPTION_STATE`
|
||||
* [ ] `_MM_GET_EXCEPTION_MASK`
|
||||
* [ ] `_MM_SET_EXCEPTION_MASK`
|
||||
* [ ] `_MM_GET_ROUNDING_MODE`
|
||||
* [ ] `_MM_SET_ROUNDING_MODE`
|
||||
* [ ] `_MM_GET_FLUSH_ZERO_MODE`
|
||||
* [ ] `_MM_SET_FLUSH_ZERO_MODE`
|
||||
* [ ] `_mm_prefetch`
|
||||
* [ ] `_mm_sfence`
|
||||
* [ ] `_mm_max_pi16`
|
||||
* [ ] `_m_pmaxsw`
|
||||
* [ ] `_mm_max_pu8`
|
||||
* [ ] `_m_pmaxub`
|
||||
* [ ] `_mm_min_pi16`
|
||||
* [ ] `_m_pminsw`
|
||||
* [ ] `_mm_min_pu8`
|
||||
* [ ] `_m_pminub`
|
||||
* [ ] `_mm_mulhi_pu16`
|
||||
* [ ] `_m_pmulhuw`
|
||||
* [ ] `_mm_avg_pu8`
|
||||
* [ ] `_m_pavgb`
|
||||
* [ ] `_mm_avg_pu16`
|
||||
* [ ] `_m_pavgw`
|
||||
* [ ] `_mm_sad_pu8`
|
||||
* [ ] `_m_psadbw`
|
||||
* [ ] `_mm_cvtsi32_ss`
|
||||
* [ ] `_mm_cvt_si2ss`
|
||||
* [ ] `_mm_cvtsi64_ss`
|
||||
* [ ] `_mm_cvtpi32_ps`
|
||||
* [ ] `_mm_cvt_pi2ps`
|
||||
* [ ] `_mm_cvtpi16_ps`
|
||||
* [ ] `_mm_cvtpu16_ps`
|
||||
* [ ] `_mm_cvtpi8_ps`
|
||||
* [ ] `_mm_cvtpu8_ps`
|
||||
* [ ] `_mm_cvtpi32x2_ps`
|
||||
* [ ] `_mm_stream_pi`
|
||||
* [ ] `_mm_maskmove_si64`
|
||||
* [ ] `_m_maskmovq`
|
||||
* [ ] `_mm_extract_pi16`
|
||||
* [ ] `_m_pextrw`
|
||||
* [ ] `_mm_insert_pi16`
|
||||
* [ ] `_m_pinsrw`
|
||||
* [ ] `_mm_movemask_pi8`
|
||||
* [ ] `_m_pmovmskb`
|
||||
* [ ] `_mm_shuffle_pi16`
|
||||
* [ ] `_m_pshufw`
|
||||
* [ ] `_mm_add_ss`
|
||||
* [ ] `_mm_add_ps`
|
||||
* [ ] `_mm_sub_ss`
|
||||
* [ ] `_mm_sub_ps`
|
||||
* [ ] `_mm_mul_ss`
|
||||
* [ ] `_mm_mul_ps`
|
||||
* [ ] `_mm_div_ss`
|
||||
* [ ] `_mm_div_ps`
|
||||
* [ ] `_mm_sqrt_ss`
|
||||
* [x] `_mm_sqrt_ps`
|
||||
* [ ] `_mm_rcp_ss`
|
||||
* [x] `_mm_rcp_ps`
|
||||
* [ ] `_mm_rsqrt_ss`
|
||||
* [x] `_mm_rsqrt_ps`
|
||||
* [ ] `_mm_min_ss`
|
||||
* [x] `_mm_min_ps`
|
||||
* [ ] `_mm_max_ss`
|
||||
* [x] `_mm_max_ps`
|
||||
* [ ] `_mm_and_ps`
|
||||
* [ ] `_mm_andnot_ps`
|
||||
* [ ] `_mm_or_ps`
|
||||
* [ ] `_mm_xor_ps`
|
||||
* [ ] `_mm_cmpeq_ss`
|
||||
* [ ] `_mm_cmpeq_ps`
|
||||
* [ ] `_mm_cmplt_ss`
|
||||
* [ ] `_mm_cmplt_ps`
|
||||
* [ ] `_mm_cmple_ss`
|
||||
* [ ] `_mm_cmple_ps`
|
||||
* [ ] `_mm_cmpgt_ss`
|
||||
* [ ] `_mm_cmpgt_ps`
|
||||
* [ ] `_mm_cmpge_ss`
|
||||
* [ ] `_mm_cmpge_ps`
|
||||
* [ ] `_mm_cmpneq_ss`
|
||||
* [ ] `_mm_cmpneq_ps`
|
||||
* [ ] `_mm_cmpnlt_ss`
|
||||
* [ ] `_mm_cmpnlt_ps`
|
||||
* [ ] `_mm_cmpnle_ss`
|
||||
* [ ] `_mm_cmpnle_ps`
|
||||
* [ ] `_mm_cmpngt_ss`
|
||||
* [ ] `_mm_cmpngt_ps`
|
||||
* [ ] `_mm_cmpnge_ss`
|
||||
* [ ] `_mm_cmpnge_ps`
|
||||
* [ ] `_mm_cmpord_ss`
|
||||
* [ ] `_mm_cmpord_ps`
|
||||
* [ ] `_mm_cmpunord_ss`
|
||||
* [ ] `_mm_cmpunord_ps`
|
||||
* [ ] `_mm_comieq_ss`
|
||||
* [ ] `_mm_comilt_ss`
|
||||
* [ ] `_mm_comile_ss`
|
||||
* [ ] `_mm_comigt_ss`
|
||||
* [ ] `_mm_comige_ss`
|
||||
* [ ] `_mm_comineq_ss`
|
||||
* [ ] `_mm_ucomieq_ss`
|
||||
* [ ] `_mm_ucomilt_ss`
|
||||
* [ ] `_mm_ucomile_ss`
|
||||
* [ ] `_mm_ucomigt_ss`
|
||||
* [ ] `_mm_ucomige_ss`
|
||||
* [ ] `_mm_ucomineq_ss`
|
||||
* [ ] `_mm_cvtss_si32`
|
||||
* [ ] `_mm_cvt_ss2si`
|
||||
* [ ] `_mm_cvtss_si64`
|
||||
* [ ] `_mm_cvtss_f32`
|
||||
* [ ] `_mm_cvtps_pi32`
|
||||
* [ ] `_mm_cvt_ps2pi`
|
||||
* [ ] `_mm_cvttss_si32`
|
||||
* [ ] `_mm_cvtt_ss2si`
|
||||
* [ ] `_mm_cvttss_si64`
|
||||
* [ ] `_mm_cvttps_pi32`
|
||||
* [ ] `_mm_cvtt_ps2pi`
|
||||
* [ ] `_mm_cvtps_pi16`
|
||||
* [ ] `_mm_cvtps_pi8`
|
||||
* [ ] `_mm_set_ss`
|
||||
* [ ] `_mm_set1_ps`
|
||||
* [ ] `_mm_set_ps1`
|
||||
* [ ] `_mm_set_ps`
|
||||
* [ ] `_mm_setr_ps`
|
||||
* [ ] `_mm_setzero_ps`
|
||||
* [ ] `_mm_loadh_pi`
|
||||
* [ ] `_mm_loadl_pi`
|
||||
* [ ] `_mm_load_ss`
|
||||
* [ ] `_mm_load1_ps`
|
||||
* [ ] `_mm_load_ps1`
|
||||
* [ ] `_mm_load_ps`
|
||||
* [ ] `_mm_loadu_ps`
|
||||
* [ ] `_mm_loadr_ps`
|
||||
* [ ] `_mm_stream_ps`
|
||||
* [ ] `_mm_storeh_pi`
|
||||
* [ ] `_mm_storel_pi`
|
||||
* [ ] `_mm_store_ss`
|
||||
* [ ] `_mm_store1_ps`
|
||||
* [ ] `_mm_store_ps1`
|
||||
* [ ] `_mm_store_ps`
|
||||
* [ ] `_mm_storeu_ps`
|
||||
* [ ] `_mm_storer_ps`
|
||||
* [ ] `_mm_move_ss`
|
||||
* [ ] `_mm_shuffle_ps`
|
||||
* [ ] `_mm_unpackhi_ps`
|
||||
* [ ] `_mm_unpacklo_ps`
|
||||
* [ ] `_mm_movehl_ps`
|
||||
* [ ] `_mm_movelh_ps`
|
||||
* [x] `_mm_movemask_ps`
|
||||
* [ ] `_mm_undefined_ps`
|
||||
|
||||
|
||||
sse2
|
||||
----
|
||||
* [x] `_mm_pause`
|
||||
|
|
@ -221,7 +385,7 @@ sse2
|
|||
* [ ] `_mm_storel_pd`
|
||||
* [ ] `_mm_unpackhi_pd`
|
||||
* [ ] `_mm_unpacklo_pd`
|
||||
* [ ] `_mm_movemask_pd`
|
||||
* [x] `_mm_movemask_pd`
|
||||
* [ ] `_mm_shuffle_pd`
|
||||
* [ ] `_mm_move_sd`
|
||||
* [ ] `_mm_castpd_ps`
|
||||
|
|
@ -234,6 +398,21 @@ sse2
|
|||
* [ ] `_mm_undefined_si128`
|
||||
|
||||
|
||||
sse3
|
||||
----
|
||||
* [ ] `_mm_addsub_ps`
|
||||
* [ ] `_mm_addsub_pd`
|
||||
* [ ] `_mm_hadd_pd`
|
||||
* [ ] `_mm_hadd_ps`
|
||||
* [ ] `_mm_hsub_pd`
|
||||
* [ ] `_mm_hsub_ps`
|
||||
* [ ] `_mm_lddqu_si128`
|
||||
* [ ] `_mm_movedup_pd`
|
||||
* [ ] `_mm_loaddup_pd`
|
||||
* [ ] `_mm_movehdup_ps`
|
||||
* [ ] `_mm_moveldup_ps`
|
||||
|
||||
|
||||
ssse3
|
||||
-----
|
||||
* [ ] `_mm_abs_pi8`
|
||||
|
|
@ -268,3 +447,91 @@ ssse3
|
|||
* [ ] `_mm_sign_pi8`
|
||||
* [ ] `_mm_sign_pi16`
|
||||
* [ ] `_mm_sign_pi32`
|
||||
|
||||
|
||||
sse4.1
|
||||
------
|
||||
* [ ] `_mm_blend_pd`
|
||||
* [ ] `_mm_blend_ps`
|
||||
* [ ] `_mm_blendv_pd`
|
||||
* [ ] `_mm_blendv_ps`
|
||||
* [ ] `_mm_blendv_epi8`
|
||||
* [ ] `_mm_blend_epi16`
|
||||
* [ ] `_mm_dp_pd`
|
||||
* [ ] `_mm_dp_ps`
|
||||
* [ ] `_mm_extract_ps`
|
||||
* [ ] `_mm_extract_epi8`
|
||||
* [ ] `_mm_extract_epi32`
|
||||
* [ ] `_mm_extract_epi64`
|
||||
* [ ] `_mm_insert_ps`
|
||||
* [ ] `_mm_insert_epi8`
|
||||
* [ ] `_mm_insert_epi32`
|
||||
* [ ] `_mm_insert_epi64`
|
||||
* [ ] `_mm_max_epi8`
|
||||
* [ ] `_mm_max_epi32`
|
||||
* [ ] `_mm_max_epu32`
|
||||
* [ ] `_mm_max_epu16`
|
||||
* [ ] `_mm_min_epi8`
|
||||
* [ ] `_mm_min_epi32`
|
||||
* [ ] `_mm_min_epu32`
|
||||
* [ ] `_mm_min_epu16`
|
||||
* [ ] `_mm_packus_epi32`
|
||||
* [ ] `_mm_cmpeq_epi64`
|
||||
* [ ] `_mm_cvtepi8_epi16`
|
||||
* [ ] `_mm_cvtepi8_epi32`
|
||||
* [ ] `_mm_cvtepi8_epi64`
|
||||
* [ ] `_mm_cvtepi16_epi32`
|
||||
* [ ] `_mm_cvtepi16_epi64`
|
||||
* [ ] `_mm_cvtepi32_epi64`
|
||||
* [ ] `_mm_cvtepu8_epi16`
|
||||
* [ ] `_mm_cvtepu8_epi32`
|
||||
* [ ] `_mm_cvtepu8_epi64`
|
||||
* [ ] `_mm_cvtepu16_epi32`
|
||||
* [ ] `_mm_cvtepu16_epi64`
|
||||
* [ ] `_mm_cvtepu32_epi64`
|
||||
* [ ] `_mm_mul_epi32`
|
||||
* [ ] `_mm_mullo_epi32`
|
||||
* [ ] `_mm_testz_si128`
|
||||
* [ ] `_mm_testc_si128`
|
||||
* [ ] `_mm_testnzc_si128`
|
||||
* [ ] `_mm_test_all_zeros`
|
||||
* [ ] `_mm_test_mix_ones_zeros`
|
||||
* [ ] `_mm_test_all_ones`
|
||||
* [ ] `_mm_round_pd`
|
||||
* [ ] `_mm_floor_pd`
|
||||
* [ ] `_mm_ceil_pd`
|
||||
* [ ] `_mm_round_ps`
|
||||
* [ ] `_mm_floor_ps`
|
||||
* [ ] `_mm_ceil_ps`
|
||||
* [ ] `_mm_round_sd`
|
||||
* [ ] `_mm_floor_sd`
|
||||
* [ ] `_mm_ceil_sd`
|
||||
* [ ] `_mm_round_ss`
|
||||
* [ ] `_mm_floor_ss`
|
||||
* [ ] `_mm_ceil_ss`
|
||||
* [ ] `_mm_minpos_epu16`
|
||||
* [ ] `_mm_mpsadbw_epu8`
|
||||
* [ ] `_mm_stream_load_si128`
|
||||
|
||||
|
||||
sse4.2
|
||||
------
|
||||
* [ ] `_mm_cmpistrm`
|
||||
* [ ] `_mm_cmpistri`
|
||||
* [ ] `_mm_cmpistrz`
|
||||
* [ ] `_mm_cmpistrc`
|
||||
* [ ] `_mm_cmpistrs`
|
||||
* [ ] `_mm_cmpistro`
|
||||
* [ ] `_mm_cmpistra`
|
||||
* [ ] `_mm_cmpestrm`
|
||||
* [ ] `_mm_cmpestri`
|
||||
* [ ] `_mm_cmpestrz`
|
||||
* [ ] `_mm_cmpestrc`
|
||||
* [ ] `_mm_cmpestrs`
|
||||
* [ ] `_mm_cmpestro`
|
||||
* [ ] `_mm_cmpestra`
|
||||
* [ ] `_mm_cmpgt_epi64`
|
||||
* [ ] `_mm_crc32_u8`
|
||||
* [ ] `_mm_crc32_u16`
|
||||
* [ ] `_mm_crc32_u32`
|
||||
* [ ] `_mm_crc32_u64`
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
)]
|
||||
|
||||
pub use v128::*;
|
||||
pub use v256::*;
|
||||
pub use v64::*;
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
pub use x86::*;
|
||||
|
|
@ -13,6 +14,7 @@ pub use x86::*;
|
|||
mod macros;
|
||||
mod simd;
|
||||
mod v128;
|
||||
mod v256;
|
||||
mod v64;
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
mod x86;
|
||||
|
|
|
|||
|
|
@ -8,8 +8,10 @@ macro_rules! define_ty {
|
|||
}
|
||||
|
||||
macro_rules! define_impl {
|
||||
($name:ident, $elemty:ident, $nelems:expr,
|
||||
$($elname:ident),+) => {
|
||||
(
|
||||
$name:ident, $elemty:ident, $nelems:expr, $boolname:ident,
|
||||
$($elname:ident),+
|
||||
) => {
|
||||
impl $name {
|
||||
#[inline]
|
||||
pub fn new($($elname: $elemty),*) -> $name {
|
||||
|
|
@ -25,25 +27,46 @@ macro_rules! define_impl {
|
|||
}),*)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn extract(self, idx: u32) -> $elemty {
|
||||
assert!(idx < $nelems);
|
||||
unsafe { simd_extract(self, idx) }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn insert(self, idx: u32, val: $elemty) -> $name {
|
||||
#[inline]
|
||||
pub fn replace(self, idx: u32, val: $elemty) -> $name {
|
||||
assert!(idx < $nelems);
|
||||
unsafe { simd_insert(self, idx, val) }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn store(self, slice: &mut [$elemty], offset: usize) {
|
||||
assert!(slice[offset..].len() >= $nelems);
|
||||
unsafe { self.store_unchecked(slice, offset) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub unsafe fn store_unchecked(
|
||||
self,
|
||||
slice: &mut [$elemty],
|
||||
offset: usize,
|
||||
) {
|
||||
use std::mem::size_of;
|
||||
use std::ptr;
|
||||
|
||||
ptr::copy_nonoverlapping(
|
||||
&self as *const $name as *const u8,
|
||||
slice.get_unchecked_mut(offset) as *mut $elemty as *mut u8,
|
||||
size_of::<$name>());
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn load(slice: &[$elemty], offset: usize) -> $name {
|
||||
assert!(slice[offset..].len() >= $nelems);
|
||||
unsafe { $name::load_unchecked(slice, offset) }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub unsafe fn load_unchecked(
|
||||
slice: &[$elemty],
|
||||
offset: usize,
|
||||
|
|
@ -58,6 +81,36 @@ macro_rules! define_impl {
|
|||
size_of::<$name>());
|
||||
x
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn eq(self, other: $name) -> $boolname {
|
||||
unsafe { simd_eq(self, other) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn ne(self, other: $name) -> $boolname {
|
||||
unsafe { simd_ne(self, other) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn lt(self, other: $name) -> $boolname {
|
||||
unsafe { simd_lt(self, other) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn le(self, other: $name) -> $boolname {
|
||||
unsafe { simd_le(self, other) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn gt(self, other: $name) -> $boolname {
|
||||
unsafe { simd_gt(self, other) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn ge(self, other: $name) -> $boolname {
|
||||
unsafe { simd_ge(self, other) }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -177,3 +230,15 @@ macro_rules! define_integer_ops {
|
|||
)+
|
||||
}
|
||||
}
|
||||
|
||||
/// Generates inherent cast methods on SIMD vector types.
///
/// Each `(source, target, method)` triple expands to
/// `impl source { pub fn method(self) -> target { .. } }`, where the body
/// forwards to the `simd_cast` intrinsic. The target type is looked up at the
/// crate root, so it may be defined in a different vector module than the
/// source type. Despite the cast being usable for any lane type, both float
/// and integer targets go through this one macro.
macro_rules! define_casts {
    ($(($from:ident, $to:ident, $method:ident)),+) => {
        $(
            impl $from {
                /// Casts this vector to the target vector type by forwarding
                /// to the `simd_cast` intrinsic.
                // `#[inline]` matches the methods generated by `define_impl!`
                // and lets this one-call wrapper be inlined across crates.
                #[inline]
                pub fn $method(self) -> $crate::$to {
                    unsafe { simd_cast(self) }
                }
            }
        )+
    }
}
|
||||
|
|
|
|||
|
|
@ -1,34 +1,34 @@
|
|||
use simd::*;
|
||||
|
||||
define_ty! { f64x2, f64, f64 }
|
||||
define_impl! { f64x2, f64, 2, x0, x1 }
|
||||
define_impl! { f64x2, f64, 2, i64x2, x0, x1 }
|
||||
|
||||
define_ty! { f32x4, f32, f32, f32, f32 }
|
||||
define_impl! { f32x4, f32, 4, x0, x1, x2, x3 }
|
||||
define_impl! { f32x4, f32, 4, i32x4, x0, x1, x2, x3 }
|
||||
|
||||
define_ty! { u64x2, u64, u64 }
|
||||
define_impl! { u64x2, u64, 2, x0, x1 }
|
||||
define_impl! { u64x2, u64, 2, i64x2, x0, x1 }
|
||||
|
||||
define_ty! { i64x2, i64, i64 }
|
||||
define_impl! { i64x2, i64, 2, x0, x1 }
|
||||
define_impl! { i64x2, i64, 2, i64x2, x0, x1 }
|
||||
|
||||
define_ty! { u32x4, u32, u32, u32, u32 }
|
||||
define_impl! { u32x4, u32, 4, x0, x1, x2, x3 }
|
||||
define_impl! { u32x4, u32, 4, i32x4, x0, x1, x2, x3 }
|
||||
|
||||
define_ty! { i32x4, i32, i32, i32, i32 }
|
||||
define_impl! { i32x4, i32, 4, x0, x1, x2, x3 }
|
||||
define_impl! { i32x4, i32, 4, i32x4, x0, x1, x2, x3 }
|
||||
|
||||
define_ty! { u16x8, u16, u16, u16, u16, u16, u16, u16, u16 }
|
||||
define_impl! { u16x8, u16, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
|
||||
define_impl! { u16x8, u16, 8, i16x8, x0, x1, x2, x3, x4, x5, x6, x7 }
|
||||
|
||||
define_ty! { i16x8, i16, i16, i16, i16, i16, i16, i16, i16 }
|
||||
define_impl! { i16x8, i16, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
|
||||
define_impl! { i16x8, i16, 8, i16x8, x0, x1, x2, x3, x4, x5, x6, x7 }
|
||||
|
||||
define_ty! {
|
||||
u8x16, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8
|
||||
}
|
||||
define_impl! {
|
||||
u8x16, u8, 16,
|
||||
u8x16, u8, 16, i8x16,
|
||||
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
|
||||
}
|
||||
|
||||
|
|
@ -36,7 +36,7 @@ define_ty! {
|
|||
i8x16, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8
|
||||
}
|
||||
define_impl! {
|
||||
i8x16, i8, 16,
|
||||
i8x16, i8, 16, i8x16,
|
||||
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
|
||||
}
|
||||
|
||||
|
|
@ -61,3 +61,22 @@ define_integer_ops!(
|
|||
(i16x8, i16),
|
||||
(u8x16, u8),
|
||||
(i8x16, i8));
|
||||
define_casts!(
|
||||
(f64x2, f32x2, as_f32x2),
|
||||
(f64x2, u64x2, as_u64x2),
|
||||
(f64x2, i64x2, as_i64x2),
|
||||
(f32x4, f64x4, as_f64x4),
|
||||
(f32x4, u32x4, as_u32x4),
|
||||
(f32x4, i32x4, as_i32x4),
|
||||
(u64x2, f64x2, as_f64x2),
|
||||
(u64x2, i64x2, as_i64x2),
|
||||
(i64x2, f64x2, as_f64x2),
|
||||
(i64x2, u64x2, as_u64x2),
|
||||
(u32x4, f32x4, as_f32x4),
|
||||
(u32x4, i32x4, as_i32x4),
|
||||
(i32x4, f32x4, as_f32x4),
|
||||
(i32x4, u32x4, as_u32x4),
|
||||
(u16x8, i16x8, as_i16x8),
|
||||
(i16x8, u16x8, as_u16x8),
|
||||
(u8x16, i8x16, as_i8x16),
|
||||
(i8x16, u8x16, as_u8x16));
|
||||
|
|
|
|||
105
library/stdarch/src/v256.rs
Normal file
105
library/stdarch/src/v256.rs
Normal file
|
|
@ -0,0 +1,105 @@
|
|||
use simd::*;
|
||||
|
||||
define_ty! { f64x4, f64, f64, f64, f64 }
|
||||
define_impl! { f64x4, f64, 4, i64x4, x0, x1, x2, x3 }
|
||||
|
||||
define_ty! { f32x8, f32, f32, f32, f32, f32, f32, f32, f32 }
|
||||
define_impl! { f32x8, f32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 }
|
||||
|
||||
define_ty! { u64x4, u64, u64, u64, u64 }
|
||||
define_impl! { u64x4, u64, 4, i64x4, x0, x1, x2, x3 }
|
||||
|
||||
define_ty! { i64x4, i64, i64, i64, i64 }
|
||||
define_impl! { i64x4, i64, 4, i64x4, x0, x1, x2, x3 }
|
||||
|
||||
define_ty! { u32x8, u32, u32, u32, u32, u32, u32, u32, u32 }
|
||||
define_impl! { u32x8, u32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 }
|
||||
|
||||
define_ty! { i32x8, i32, i32, i32, i32, i32, i32, i32, i32 }
|
||||
define_impl! { i32x8, i32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 }
|
||||
|
||||
define_ty! {
|
||||
u16x16,
|
||||
u16, u16, u16, u16, u16, u16, u16, u16,
|
||||
u16, u16, u16, u16, u16, u16, u16, u16
|
||||
}
|
||||
define_impl! {
|
||||
u16x16, u16, 16, i16x16,
|
||||
x0, x1, x2, x3, x4, x5, x6, x7,
|
||||
x8, x9, x10, x11, x12, x13, x14, x15
|
||||
}
|
||||
|
||||
define_ty! {
|
||||
i16x16,
|
||||
i16, i16, i16, i16, i16, i16, i16, i16,
|
||||
i16, i16, i16, i16, i16, i16, i16, i16
|
||||
}
|
||||
define_impl! {
|
||||
i16x16, i16, 16, i16x16,
|
||||
x0, x1, x2, x3, x4, x5, x6, x7,
|
||||
x8, x9, x10, x11, x12, x13, x14, x15
|
||||
}
|
||||
|
||||
define_ty! {
|
||||
u8x32,
|
||||
u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8,
|
||||
u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8
|
||||
}
|
||||
define_impl! {
|
||||
u8x32, u8, 32, i8x32,
|
||||
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,
|
||||
x16, x17, x18, x19, x20, x21, x22, x23,
|
||||
x24, x25, x26, x27, x28, x29, x30, x31
|
||||
}
|
||||
|
||||
define_ty! {
|
||||
i8x32,
|
||||
i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8,
|
||||
i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8
|
||||
}
|
||||
define_impl! {
|
||||
i8x32, i8, 32, i8x32,
|
||||
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,
|
||||
x16, x17, x18, x19, x20, x21, x22, x23,
|
||||
x24, x25, x26, x27, x28, x29, x30, x31
|
||||
}
|
||||
|
||||
define_from!(u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32);
|
||||
define_from!(i64x4, u64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32);
|
||||
define_from!(u32x8, u64x4, i64x4, i32x8, u16x16, i16x16, u8x32, i8x32);
|
||||
define_from!(i32x8, u64x4, i64x4, u32x8, u16x16, i16x16, u8x32, i8x32);
|
||||
define_from!(u16x16, u64x4, i64x4, u32x8, i32x8, i16x16, u8x32, i8x32);
|
||||
define_from!(i16x16, u64x4, i64x4, u32x8, i32x8, u16x16, u8x32, i8x32);
|
||||
define_from!(u8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, i8x32);
|
||||
define_from!(i8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32);
|
||||
|
||||
define_common_ops!(
|
||||
f64x4, f32x8, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32);
|
||||
define_float_ops!(f64x4, f32x8);
|
||||
define_integer_ops!(
|
||||
(u64x4, u64),
|
||||
(i64x4, i64),
|
||||
(u32x8, u32),
|
||||
(i32x8, i32),
|
||||
(u16x16, u16),
|
||||
(i16x16, i16),
|
||||
(u8x32, u8),
|
||||
(i8x32, i8));
|
||||
define_casts!(
|
||||
(f64x4, f32x4, as_f32x4),
|
||||
(f64x4, u64x4, as_u64x4),
|
||||
(f64x4, i64x4, as_i64x4),
|
||||
(f32x8, u32x8, as_u32x8),
|
||||
(f32x8, i32x8, as_i32x8),
|
||||
(u64x4, f64x4, as_f64x4),
|
||||
(u64x4, i64x4, as_i64x4),
|
||||
(i64x4, f64x4, as_f64x4),
|
||||
(i64x4, u64x4, as_u64x4),
|
||||
(u32x8, f32x8, as_f32x8),
|
||||
(u32x8, i32x8, as_i32x8),
|
||||
(i32x8, f32x8, as_f32x8),
|
||||
(i32x8, u32x8, as_u32x8),
|
||||
(u16x16, i16x16, as_i16x16),
|
||||
(i16x16, u16x16, as_u16x16),
|
||||
(u8x32, i8x32, as_i8x32),
|
||||
(i8x32, u8x32, as_u8x32));
|
||||
|
|
@ -1,25 +1,25 @@
|
|||
use simd::*;
|
||||
|
||||
define_ty! { f32x2, f32, f32 }
|
||||
define_impl! { f32x2, f32, 2, x0, x1 }
|
||||
define_impl! { f32x2, f32, 2, i32x2, x0, x1 }
|
||||
|
||||
define_ty! { u32x2, u32, u32 }
|
||||
define_impl! { u32x2, u32, 2, x0, x1 }
|
||||
define_impl! { u32x2, u32, 2, i32x2, x0, x1 }
|
||||
|
||||
define_ty! { i32x2, i32, i32 }
|
||||
define_impl! { i32x2, i32, 2, x0, x1 }
|
||||
define_impl! { i32x2, i32, 2, i32x2, x0, x1 }
|
||||
|
||||
define_ty! { u16x4, u16, u16, u16, u16 }
|
||||
define_impl! { u16x4, u16, 4, x0, x1, x2, x3 }
|
||||
define_impl! { u16x4, u16, 4, i16x4, x0, x1, x2, x3 }
|
||||
|
||||
define_ty! { i16x4, i16, i16, i16, i16 }
|
||||
define_impl! { i16x4, i16, 4, x0, x1, x2, x3 }
|
||||
define_impl! { i16x4, i16, 4, i16x4, x0, x1, x2, x3 }
|
||||
|
||||
define_ty! { u8x8, u8, u8, u8, u8, u8, u8, u8, u8 }
|
||||
define_impl! { u8x8, u8, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
|
||||
define_impl! { u8x8, u8, 8, i8x8, x0, x1, x2, x3, x4, x5, x6, x7 }
|
||||
|
||||
define_ty! { i8x8, i8, i8, i8, i8, i8, i8, i8, i8 }
|
||||
define_impl! { i8x8, i8, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
|
||||
define_impl! { i8x8, i8, 8, i8x8, x0, x1, x2, x3, x4, x5, x6, x7 }
|
||||
|
||||
define_from!(u32x2, i32x2, u16x4, i16x4, u8x8, i8x8);
|
||||
define_from!(i32x2, u32x2, u16x4, i16x4, u8x8, i8x8);
|
||||
|
|
@ -37,3 +37,15 @@ define_integer_ops!(
|
|||
(i16x4, i16),
|
||||
(u8x8, u8),
|
||||
(i8x8, i8));
|
||||
define_casts!(
|
||||
(f32x2, f64x2, as_f64x2),
|
||||
(f32x2, u32x2, as_u32x2),
|
||||
(f32x2, i32x2, as_i32x2),
|
||||
(u32x2, f32x2, as_f32x2),
|
||||
(u32x2, i32x2, as_i32x2),
|
||||
(i32x2, f32x2, as_f32x2),
|
||||
(i32x2, u32x2, as_u32x2),
|
||||
(u16x4, i16x4, as_i16x4),
|
||||
(i16x4, u16x4, as_u16x4),
|
||||
(u8x8, i8x8, as_i8x8),
|
||||
(i8x8, u8x8, as_u8x8));
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
// pub use self::sse::*;
|
||||
pub use self::sse::*;
|
||||
pub use self::sse2::*;
|
||||
pub use self::ssse3::*;
|
||||
pub use self::sse42::*;
|
||||
|
|
@ -6,7 +6,7 @@ pub use self::sse42::*;
|
|||
#[allow(non_camel_case_types)]
|
||||
pub type __m128i = ::v128::i8x16;
|
||||
|
||||
// mod sse;
|
||||
mod sse;
|
||||
mod sse2;
|
||||
mod ssse3;
|
||||
mod sse42;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,128 @@
|
|||
use v128::*;
|
||||
|
||||
/// Return the square root of packed single-precision (32-bit) floating-point
|
||||
/// elements in `a`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
|
||||
unsafe { sqrtps(a) }
|
||||
}
|
||||
|
||||
/// Return the approximate reciprocal of packed single-precision (32-bit)
|
||||
/// floating-point elements in `a`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
|
||||
unsafe { rcpps(a) }
|
||||
}
|
||||
|
||||
/// Return the approximate reciprocal square root of packed single-precision
|
||||
/// (32-bit) floating-point elements in `a`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
|
||||
unsafe { rsqrtps(a) }
|
||||
}
|
||||
|
||||
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
|
||||
/// `b`, and return the corresponding minimum values.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { minps(a, b) }
|
||||
}
|
||||
|
||||
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
|
||||
/// `b`, and return the corresponding maximum values.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
pub fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { maxps(a, b) }
|
||||
}
|
||||
|
||||
/// Return a mask of the most significant bit of each element in `a`.
|
||||
///
|
||||
/// The mask is stored in the 4 least significant bits of the return value.
|
||||
/// All other bits are set to `0`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
pub fn _mm_movemask_ps(a: f32x4) -> i32 {
|
||||
unsafe { movmskps(a) }
|
||||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
extern {
|
||||
#[link_name = "llvm.x86.sse.sqrt.ps"]
|
||||
fn sqrtps(a: f32x4) -> f32x4;
|
||||
#[link_name = "llvm.x86.sse.rcp.ps"]
|
||||
fn rcpps(a: f32x4) -> f32x4;
|
||||
#[link_name = "llvm.x86.sse.rsqrt.ps"]
|
||||
fn rsqrtps(a: f32x4) -> f32x4;
|
||||
#[link_name = "llvm.x86.sse.min.ps"]
|
||||
fn minps(a: f32x4, b: f32x4) -> f32x4;
|
||||
#[link_name = "llvm.x86.sse.max.ps"]
|
||||
fn maxps(a: f32x4, b: f32x4) -> f32x4;
|
||||
#[link_name = "llvm.x86.sse.movmsk.ps"]
|
||||
fn movmskps(a: f32x4) -> i32;
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use v128::*;
    use x86::sse;

    /// Relative-error bound for the `rcpps` / `rsqrtps` approximations.
    ///
    /// The instruction set reference specifies a maximum relative error of
    /// `1.5 * 2^-12` for both instructions, but not the exact result bits,
    /// which can differ between CPU implementations.
    const MAX_APPROX_REL_ERR: f32 = 1.5 / 4096.0;

    /// Assert that every lane of `actual` lies within `MAX_APPROX_REL_ERR`
    /// (relative) of the matching entry in `expected`.
    ///
    /// A NaN lane fails the assertion because the comparison is false.
    fn assert_lanes_approx_eq(actual: f32x4, expected: [f32; 4]) {
        for (lane, &want) in expected.iter().enumerate() {
            let got = actual.extract(lane as u32);
            let rel_err = ((got - want) / want).abs();
            assert!(
                rel_err <= MAX_APPROX_REL_ERR,
                "lane {}: got {}, expected about {} (relative error {})",
                lane, got, want, rel_err
            );
        }
    }

    /// The square roots are compared exactly against fixed expected values.
    #[test]
    #[target_feature = "+sse"]
    fn _mm_sqrt_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_sqrt_ps(a);
        let e = f32x4::new(2.0, 3.6055512, 4.0, 10.0);
        assert_eq!(r, e);
    }

    /// The reciprocal is only an approximation, so compare against the true
    /// reciprocal with a tolerance instead of against one CPU's exact bits.
    #[test]
    #[target_feature = "+sse"]
    fn _mm_rcp_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_rcp_ps(a);
        assert_lanes_approx_eq(r, [1.0 / 4.0, 1.0 / 13.0, 1.0 / 16.0, 1.0 / 100.0]);
    }

    /// The reciprocal square root is only an approximation, so compare
    /// against the true value with a tolerance.
    #[test]
    #[target_feature = "+sse"]
    fn _mm_rsqrt_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_rsqrt_ps(a);
        let inv_sqrt_13 = (13.0f32).sqrt().recip();
        assert_lanes_approx_eq(r, [0.5, inv_sqrt_13, 0.25, 0.1]);
    }

    /// Lane-wise minimum, including an equal pair and negative values.
    #[test]
    #[target_feature = "+sse"]
    fn _mm_min_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
        let r = sse::_mm_min_ps(a, b);
        assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
    }

    /// Lane-wise maximum, including an equal pair and negative values.
    #[test]
    #[target_feature = "+sse"]
    fn _mm_max_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
        let r = sse::_mm_max_ps(a, b);
        assert_eq!(r, f32x4::new(-1.0, 20.0, 0.0, -5.0));
    }

    /// Lane `i` contributes bit `i` of the mask when its sign bit is set.
    #[test]
    #[target_feature = "+sse"]
    fn _mm_movemask_ps() {
        let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0));
        assert_eq!(r, 0b0101);

        let r = sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0));
        assert_eq!(r, 0b0111);
    }
}
|
||||
|
|
@ -2,7 +2,9 @@ use std::mem;
|
|||
use std::os::raw::c_void;
|
||||
use std::ptr;
|
||||
|
||||
use simd::*;
|
||||
use simd::{
|
||||
simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
|
||||
};
|
||||
use x86::__m128i;
|
||||
use v128::*;
|
||||
use v64::*;
|
||||
|
|
@ -519,63 +521,63 @@ pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cmpeq_epi8(a: i8x16, b: i8x16) -> i8x16 {
|
||||
unsafe { simd_eq(a, b) }
|
||||
a.eq(b)
|
||||
}
|
||||
|
||||
/// Compare packed 16-bit integers in `a` and `b` for equality.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cmpeq_epi16(a: i16x8, b: i16x8) -> i16x8 {
|
||||
unsafe { simd_eq(a, b) }
|
||||
a.eq(b)
|
||||
}
|
||||
|
||||
/// Compare packed 32-bit integers in `a` and `b` for equality.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cmpeq_epi32(a: i32x4, b: i32x4) -> i32x4 {
|
||||
unsafe { simd_eq(a, b) }
|
||||
a.eq(b)
|
||||
}
|
||||
|
||||
/// Compare packed 8-bit integers in `a` and `b` for greater-than.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cmpgt_epi8(a: i8x16, b: i8x16) -> i8x16 {
|
||||
unsafe { simd_gt(a, b) }
|
||||
a.gt(b)
|
||||
}
|
||||
|
||||
/// Compare packed 16-bit integers in `a` and `b` for greater-than.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cmpgt_epi16(a: i16x8, b: i16x8) -> i16x8 {
|
||||
unsafe { simd_gt(a, b) }
|
||||
a.gt(b)
|
||||
}
|
||||
|
||||
/// Compare packed 32-bit integers in `a` and `b` for greater-than.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cmpgt_epi32(a: i32x4, b: i32x4) -> i32x4 {
|
||||
unsafe { simd_gt(a, b) }
|
||||
a.gt(b)
|
||||
}
|
||||
|
||||
/// Compare packed 8-bit integers in `a` and `b` for less-than.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cmplt_epi8(a: i8x16, b: i8x16) -> i8x16 {
|
||||
unsafe { simd_lt(a, b) }
|
||||
a.lt(b)
|
||||
}
|
||||
|
||||
/// Compare packed 16-bit integers in `a` and `b` for less-than.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cmplt_epi16(a: i16x8, b: i16x8) -> i16x8 {
|
||||
unsafe { simd_lt(a, b) }
|
||||
a.lt(b)
|
||||
}
|
||||
|
||||
/// Compare packed 32-bit integers in `a` and `b` for less-than.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 {
|
||||
unsafe { simd_lt(a, b) }
|
||||
a.lt(b)
|
||||
}
|
||||
|
||||
/// Convert the lower two packed 32-bit integers in `a` to packed
|
||||
|
|
@ -591,7 +593,7 @@ pub fn _mm_cvtepi32_pd(a: i32x4) -> f64x2 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cvtsi32_sd(a: f64x2, b: i32) -> f64x2 {
|
||||
a.insert(0, b as f64)
|
||||
a.replace(0, b as f64)
|
||||
}
|
||||
|
||||
/// Return `a` with its lower element replaced by `b` after converting it to
|
||||
|
|
@ -599,7 +601,7 @@ pub fn _mm_cvtsi32_sd(a: f64x2, b: i32) -> f64x2 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cvtsi64_sd(a: f64x2, b: i64) -> f64x2 {
|
||||
a.insert(0, b as f64)
|
||||
a.replace(0, b as f64)
|
||||
}
|
||||
|
||||
/// Return `a` with its lower element replaced by `b` after converting it to
|
||||
|
|
@ -842,7 +844,7 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_move_epi64(a: i64x2) -> i64x2 {
|
||||
a.insert(1, 0)
|
||||
a.replace(1, 0)
|
||||
}
|
||||
|
||||
/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
|
||||
|
|
@ -880,7 +882,7 @@ pub fn _mm_extract_epi16(a: i16x8, imm8: i32) -> i32 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_insert_epi16(a: i16x8, i: i32, imm8: i32) -> i16x8 {
|
||||
a.insert(imm8 as u32 & 0b111, i as i16)
|
||||
a.replace(imm8 as u32 & 0b111, i as i16)
|
||||
}
|
||||
|
||||
/// Return a mask of the most significant bit of each element in `a`.
|
||||
|
|
@ -1134,7 +1136,7 @@ pub fn _mm_unpacklo_epi64(a: i64x2, b: i64x2) -> i64x2 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_add_sd(a: f64x2, b: f64x2) -> f64x2 {
|
||||
a.insert(0, a.extract(0) + b.extract(0))
|
||||
a.replace(0, a.extract(0) + b.extract(0))
|
||||
}
|
||||
|
||||
/// Add packed double-precision (64-bit) floating-point elements in `a` and
|
||||
|
|
@ -1150,7 +1152,7 @@ pub fn _mm_add_pd(a: f64x2, b: f64x2) -> f64x2 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_div_sd(a: f64x2, b: f64x2) -> f64x2 {
|
||||
a.insert(0, a.extract(0) / b.extract(0))
|
||||
a.replace(0, a.extract(0) / b.extract(0))
|
||||
}
|
||||
|
||||
/// Divide packed double-precision (64-bit) floating-point elements in `a` by
|
||||
|
|
@ -1198,7 +1200,7 @@ pub fn _mm_min_pd(a: f64x2, b: f64x2) -> f64x2 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_mul_sd(a: f64x2, b: f64x2) -> f64x2 {
|
||||
a.insert(0, a.extract(0) * b.extract(0))
|
||||
a.replace(0, a.extract(0) * b.extract(0))
|
||||
}
|
||||
|
||||
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
|
||||
|
|
@ -1214,7 +1216,7 @@ pub fn _mm_mul_pd(a: f64x2, b: f64x2) -> f64x2 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_sqrt_sd(a: f64x2, b: f64x2) -> f64x2 {
|
||||
a.insert(0, unsafe { sqrtsd(b).extract(0) })
|
||||
a.replace(0, unsafe { sqrtsd(b).extract(0) })
|
||||
}
|
||||
|
||||
/// Return a new vector with the square root of each of the values in `a`.
|
||||
|
|
@ -1229,7 +1231,7 @@ pub fn _mm_sqrt_pd(a: f64x2) -> f64x2 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_sub_sd(a: f64x2, b: f64x2) -> f64x2 {
|
||||
a.insert(0, a.extract(0) - b.extract(0))
|
||||
a.replace(0, a.extract(0) - b.extract(0))
|
||||
}
|
||||
|
||||
/// Subtract packed double-precision (64-bit) floating-point elements in `b`
|
||||
|
|
@ -1314,7 +1316,7 @@ pub fn _mm_cmple_sd(a: f64x2, b: f64x2) -> f64x2 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 {
|
||||
_mm_cmplt_sd(b, a).insert(1, a.extract(1))
|
||||
_mm_cmplt_sd(b, a).replace(1, a.extract(1))
|
||||
}
|
||||
|
||||
/// Return a new vector with the low element of `a` replaced by the
|
||||
|
|
@ -1322,7 +1324,7 @@ pub fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cmpge_sd(a: f64x2, b: f64x2) -> f64x2 {
|
||||
_mm_cmple_sd(b, a).insert(1, a.extract(1))
|
||||
_mm_cmple_sd(b, a).replace(1, a.extract(1))
|
||||
}
|
||||
|
||||
/// Return a new vector with the low element of `a` replaced by the result
|
||||
|
|
@ -1373,7 +1375,7 @@ pub fn _mm_cmpnle_sd(a: f64x2, b: f64x2) -> f64x2 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 {
|
||||
_mm_cmpnlt_sd(b, a).insert(1, a.extract(1))
|
||||
_mm_cmpnlt_sd(b, a).replace(1, a.extract(1))
|
||||
}
|
||||
|
||||
/// Return a new vector with the low element of `a` replaced by the
|
||||
|
|
@ -1381,7 +1383,7 @@ pub fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_cmpnge_sd(a: f64x2, b: f64x2) -> f64x2 {
|
||||
_mm_cmpnle_sd(b, a).insert(1, a.extract(1))
|
||||
_mm_cmpnle_sd(b, a).replace(1, a.extract(1))
|
||||
}
|
||||
|
||||
/// Compare corresponding elements in `a` and `b` for equality.
|
||||
|
|
@ -1553,8 +1555,15 @@ pub fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool {
|
|||
unsafe { mem::transmute(ucomineqsd(a, b) as u8) }
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Return a mask of the most significant bit of each element in `a`.
|
||||
///
|
||||
/// The mask is stored in the 2 least significant bits of the return value.
|
||||
/// All other bits are set to `0`.
///
/// For example, `f64x2::new(-1.0, 5.0)` yields `0b01` and
/// `f64x2::new(-1.0, -5.0)` yields `0b11`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub fn _mm_movemask_pd(a: f64x2) -> i32 {
|
||||
// Forwards to the `movmskpd` extern declaration, which is linked to the
// `llvm.x86.sse2.movmsk.pd` LLVM intrinsic. The `unsafe` block is required
// because `movmskpd` is a foreign function; it takes the vector by value.
unsafe { movmskpd(a) }
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
@ -1703,6 +1712,8 @@ extern {
|
|||
fn ucomigesd(a: f64x2, b: f64x2) -> i32;
|
||||
#[link_name = "llvm.x86.sse2.ucomineq.sd"]
|
||||
fn ucomineqsd(a: f64x2, b: f64x2) -> i32;
|
||||
#[link_name = "llvm.x86.sse2.movmsk.pd"]
|
||||
fn movmskpd(a: f64x2) -> i32;
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
@ -2306,7 +2317,7 @@ mod tests {
|
|||
let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
|
||||
let b = i16x8::new(7, 6, 2, 4, 3, 2, 1, 0);
|
||||
let r = sse2::_mm_cmpeq_epi16(a, b);
|
||||
assert_eq!(r, i16x8::splat(0).insert(2, 0xFFFFu16 as i16));
|
||||
assert_eq!(r, i16x8::splat(0).replace(2, 0xFFFFu16 as i16));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -2314,55 +2325,55 @@ mod tests {
|
|||
let a = i32x4::new(0, 1, 2, 3);
|
||||
let b = i32x4::new(3, 2, 2, 0);
|
||||
let r = sse2::_mm_cmpeq_epi32(a, b);
|
||||
assert_eq!(r, i32x4::splat(0).insert(2, 0xFFFFFFFFu32 as i32));
|
||||
assert_eq!(r, i32x4::splat(0).replace(2, 0xFFFFFFFFu32 as i32));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn _mm_cmpgt_epi8() {
|
||||
// `a` is all zeros except lane 0, which is 5; `b` is all zeros.
let a = i8x16::splat(0).insert(0, 5);
|
||||
let a = i8x16::splat(0).replace(0, 5);
|
||||
let b = i8x16::splat(0);
|
||||
let r = sse2::_mm_cmpgt_epi8(a, b);
|
||||
// Only lane 0 satisfies `a > b`, so only lane 0 is expected to have all
// bits set (`0xFF`); every other lane is expected to stay 0.
assert_eq!(r, i8x16::splat(0).insert(0, 0xFFu8 as i8));
|
||||
assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn _mm_cmpgt_epi16() {
|
||||
// `a` is all zeros except lane 0, which is 5; `b` is all zeros.
let a = i16x8::splat(0).insert(0, 5);
|
||||
let a = i16x8::splat(0).replace(0, 5);
|
||||
let b = i16x8::splat(0);
|
||||
let r = sse2::_mm_cmpgt_epi16(a, b);
|
||||
// Only lane 0 satisfies `a > b`, so only lane 0 is expected to have all
// bits set (`0xFFFF`); every other lane is expected to stay 0.
assert_eq!(r, i16x8::splat(0).insert(0, 0xFFFFu16 as i16));
|
||||
assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn _mm_cmpgt_epi32() {
|
||||
// `a` is all zeros except lane 0, which is 5; `b` is all zeros.
let a = i32x4::splat(0).insert(0, 5);
|
||||
let a = i32x4::splat(0).replace(0, 5);
|
||||
let b = i32x4::splat(0);
|
||||
let r = sse2::_mm_cmpgt_epi32(a, b);
|
||||
// Only lane 0 satisfies `a > b`, so only lane 0 is expected to have all
// bits set (`0xFFFFFFFF`); every other lane is expected to stay 0.
assert_eq!(r, i32x4::splat(0).insert(0, 0xFFFFFFFFu32 as i32));
|
||||
assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn _mm_cmplt_epi8() {
|
||||
// `a` is all zeros; `b` is all zeros except lane 0, which is 5.
let a = i8x16::splat(0);
|
||||
let b = i8x16::splat(0).insert(0, 5);
|
||||
let b = i8x16::splat(0).replace(0, 5);
|
||||
let r = sse2::_mm_cmplt_epi8(a, b);
|
||||
// Only lane 0 satisfies `a < b`, so only lane 0 is expected to have all
// bits set (`0xFF`); every other lane is expected to stay 0.
assert_eq!(r, i8x16::splat(0).insert(0, 0xFFu8 as i8));
|
||||
assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn _mm_cmplt_epi16() {
|
||||
// `a` is all zeros; `b` is all zeros except lane 0, which is 5.
let a = i16x8::splat(0);
|
||||
let b = i16x8::splat(0).insert(0, 5);
|
||||
let b = i16x8::splat(0).replace(0, 5);
|
||||
let r = sse2::_mm_cmplt_epi16(a, b);
|
||||
// Only lane 0 satisfies `a < b`, so only lane 0 is expected to have all
// bits set (`0xFFFF`); every other lane is expected to stay 0.
assert_eq!(r, i16x8::splat(0).insert(0, 0xFFFFu16 as i16));
|
||||
assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn _mm_cmplt_epi32() {
|
||||
// `a` is all zeros; `b` is all zeros except lane 0, which is 5.
let a = i32x4::splat(0);
|
||||
let b = i32x4::splat(0).insert(0, 5);
|
||||
let b = i32x4::splat(0).replace(0, 5);
|
||||
let r = sse2::_mm_cmplt_epi32(a, b);
|
||||
// Only lane 0 satisfies `a < b`, so only lane 0 is expected to have all
// bits set (`0xFFFFFFFF`); every other lane is expected to stay 0.
assert_eq!(r, i32x4::splat(0).insert(0, 0xFFFFFFFFu32 as i32));
|
||||
assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -2504,12 +2515,12 @@ mod tests {
|
|||
#[test]
|
||||
fn _mm_maskmoveu_si128() {
|
||||
// Source vector: every byte is 9.
let a = i8x16::splat(9);
|
||||
// Mask: only byte 2 has its high bit (`0x80`) set; all other bytes are 0.
let mask = i8x16::splat(0).insert(2, 0x80u8 as i8);
|
||||
let mask = i8x16::splat(0).replace(2, 0x80u8 as i8);
|
||||
// Destination starts zeroed and is handed to the intrinsic as a raw
// `*mut i8` pointing at `r` itself.
let mut r = i8x16::splat(0);
|
||||
unsafe {
|
||||
sse2::_mm_maskmoveu_si128(a, mask, &mut r as *mut _ as *mut i8);
|
||||
}
|
||||
// Only byte 2 of `r` is expected to have received the value from `a`;
// the remaining bytes are expected to be left at 0.
assert_eq!(r, i8x16::splat(0).insert(2, 9));
|
||||
assert_eq!(r, i8x16::splat(0).replace(2, 9));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -2586,7 +2597,7 @@ mod tests {
|
|||
#[test]
|
||||
fn _mm_insert_epi16() {
|
||||
let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
|
||||
// Inserting 9 at index 0 through the intrinsic is expected to match
// setting lane 0 of `a` to 9 directly on the vector type.
assert_eq!(sse2::_mm_insert_epi16(a, 9, 0), a.insert(0, 9));
|
||||
assert_eq!(sse2::_mm_insert_epi16(a, 9, 0), a.replace(0, 9));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -3207,4 +3218,13 @@ mod tests {
|
|||
let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
|
||||
assert!(!sse2::_mm_ucomineq_sd(a, b));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn _mm_movemask_pd() {
|
||||
// Only the first element is negative, so only bit 0 of the mask is
// expected to be set.
let r = sse2::_mm_movemask_pd(f64x2::new(-1.0, 5.0));
|
||||
assert_eq!(r, 0b01);
|
||||
|
||||
// Both elements are negative, so both mask bits are expected to be set.
let r = sse2::_mm_movemask_pd(f64x2::new(-1.0, -5.0));
|
||||
assert_eq!(r, 0b11);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue