Add 256-bit vectors and some SSE intrinsics.

This commit is contained in:
Andrew Gallant 2017-03-18 15:21:32 -04:00
parent df03dc4d80
commit 27e307278a
9 changed files with 687 additions and 69 deletions

View file

@ -1,3 +1,167 @@
**TIP**: Use the following command to generate a section in this list for
Intel intrinsics. Replace `SSE4.2` with the name of the intended CPU
feature/extension (e.g. `AVX2`).
```
rg '^<intrinsic' intel-intrinsics-3.3.15.xml | rg "'SSE4.2'" | rg '^.*name=\x27([^\x27]+)\x27.*$' -r '* [ ] `$1`' >> TODO.md
```
sse
---
* [ ] `_MM_TRANSPOSE4_PS`
* [ ] `_mm_getcsr`
* [ ] `_mm_setcsr`
* [ ] `_MM_GET_EXCEPTION_STATE`
* [ ] `_MM_SET_EXCEPTION_STATE`
* [ ] `_MM_GET_EXCEPTION_MASK`
* [ ] `_MM_SET_EXCEPTION_MASK`
* [ ] `_MM_GET_ROUNDING_MODE`
* [ ] `_MM_SET_ROUNDING_MODE`
* [ ] `_MM_GET_FLUSH_ZERO_MODE`
* [ ] `_MM_SET_FLUSH_ZERO_MODE`
* [ ] `_mm_prefetch`
* [ ] `_mm_sfence`
* [ ] `_mm_max_pi16`
* [ ] `_m_pmaxsw`
* [ ] `_mm_max_pu8`
* [ ] `_m_pmaxub`
* [ ] `_mm_min_pi16`
* [ ] `_m_pminsw`
* [ ] `_mm_min_pu8`
* [ ] `_m_pminub`
* [ ] `_mm_mulhi_pu16`
* [ ] `_m_pmulhuw`
* [ ] `_mm_avg_pu8`
* [ ] `_m_pavgb`
* [ ] `_mm_avg_pu16`
* [ ] `_m_pavgw`
* [ ] `_mm_sad_pu8`
* [ ] `_m_psadbw`
* [ ] `_mm_cvtsi32_ss`
* [ ] `_mm_cvt_si2ss`
* [ ] `_mm_cvtsi64_ss`
* [ ] `_mm_cvtpi32_ps`
* [ ] `_mm_cvt_pi2ps`
* [ ] `_mm_cvtpi16_ps`
* [ ] `_mm_cvtpu16_ps`
* [ ] `_mm_cvtpi8_ps`
* [ ] `_mm_cvtpu8_ps`
* [ ] `_mm_cvtpi32x2_ps`
* [ ] `_mm_stream_pi`
* [ ] `_mm_maskmove_si64`
* [ ] `_m_maskmovq`
* [ ] `_mm_extract_pi16`
* [ ] `_m_pextrw`
* [ ] `_mm_insert_pi16`
* [ ] `_m_pinsrw`
* [ ] `_mm_movemask_pi8`
* [ ] `_m_pmovmskb`
* [ ] `_mm_shuffle_pi16`
* [ ] `_m_pshufw`
* [ ] `_mm_add_ss`
* [ ] `_mm_add_ps`
* [ ] `_mm_sub_ss`
* [ ] `_mm_sub_ps`
* [ ] `_mm_mul_ss`
* [ ] `_mm_mul_ps`
* [ ] `_mm_div_ss`
* [ ] `_mm_div_ps`
* [ ] `_mm_sqrt_ss`
* [x] `_mm_sqrt_ps`
* [ ] `_mm_rcp_ss`
* [x] `_mm_rcp_ps`
* [ ] `_mm_rsqrt_ss`
* [x] `_mm_rsqrt_ps`
* [ ] `_mm_min_ss`
* [x] `_mm_min_ps`
* [ ] `_mm_max_ss`
* [x] `_mm_max_ps`
* [ ] `_mm_and_ps`
* [ ] `_mm_andnot_ps`
* [ ] `_mm_or_ps`
* [ ] `_mm_xor_ps`
* [ ] `_mm_cmpeq_ss`
* [ ] `_mm_cmpeq_ps`
* [ ] `_mm_cmplt_ss`
* [ ] `_mm_cmplt_ps`
* [ ] `_mm_cmple_ss`
* [ ] `_mm_cmple_ps`
* [ ] `_mm_cmpgt_ss`
* [ ] `_mm_cmpgt_ps`
* [ ] `_mm_cmpge_ss`
* [ ] `_mm_cmpge_ps`
* [ ] `_mm_cmpneq_ss`
* [ ] `_mm_cmpneq_ps`
* [ ] `_mm_cmpnlt_ss`
* [ ] `_mm_cmpnlt_ps`
* [ ] `_mm_cmpnle_ss`
* [ ] `_mm_cmpnle_ps`
* [ ] `_mm_cmpngt_ss`
* [ ] `_mm_cmpngt_ps`
* [ ] `_mm_cmpnge_ss`
* [ ] `_mm_cmpnge_ps`
* [ ] `_mm_cmpord_ss`
* [ ] `_mm_cmpord_ps`
* [ ] `_mm_cmpunord_ss`
* [ ] `_mm_cmpunord_ps`
* [ ] `_mm_comieq_ss`
* [ ] `_mm_comilt_ss`
* [ ] `_mm_comile_ss`
* [ ] `_mm_comigt_ss`
* [ ] `_mm_comige_ss`
* [ ] `_mm_comineq_ss`
* [ ] `_mm_ucomieq_ss`
* [ ] `_mm_ucomilt_ss`
* [ ] `_mm_ucomile_ss`
* [ ] `_mm_ucomigt_ss`
* [ ] `_mm_ucomige_ss`
* [ ] `_mm_ucomineq_ss`
* [ ] `_mm_cvtss_si32`
* [ ] `_mm_cvt_ss2si`
* [ ] `_mm_cvtss_si64`
* [ ] `_mm_cvtss_f32`
* [ ] `_mm_cvtps_pi32`
* [ ] `_mm_cvt_ps2pi`
* [ ] `_mm_cvttss_si32`
* [ ] `_mm_cvtt_ss2si`
* [ ] `_mm_cvttss_si64`
* [ ] `_mm_cvttps_pi32`
* [ ] `_mm_cvtt_ps2pi`
* [ ] `_mm_cvtps_pi16`
* [ ] `_mm_cvtps_pi8`
* [ ] `_mm_set_ss`
* [ ] `_mm_set1_ps`
* [ ] `_mm_set_ps1`
* [ ] `_mm_set_ps`
* [ ] `_mm_setr_ps`
* [ ] `_mm_setzero_ps`
* [ ] `_mm_loadh_pi`
* [ ] `_mm_loadl_pi`
* [ ] `_mm_load_ss`
* [ ] `_mm_load1_ps`
* [ ] `_mm_load_ps1`
* [ ] `_mm_load_ps`
* [ ] `_mm_loadu_ps`
* [ ] `_mm_loadr_ps`
* [ ] `_mm_stream_ps`
* [ ] `_mm_storeh_pi`
* [ ] `_mm_storel_pi`
* [ ] `_mm_store_ss`
* [ ] `_mm_store1_ps`
* [ ] `_mm_store_ps1`
* [ ] `_mm_store_ps`
* [ ] `_mm_storeu_ps`
* [ ] `_mm_storer_ps`
* [ ] `_mm_move_ss`
* [ ] `_mm_shuffle_ps`
* [ ] `_mm_unpackhi_ps`
* [ ] `_mm_unpacklo_ps`
* [ ] `_mm_movehl_ps`
* [ ] `_mm_movelh_ps`
* [x] `_mm_movemask_ps`
* [ ] `_mm_undefined_ps`
sse2
----
* [x] `_mm_pause`
@ -221,7 +385,7 @@ sse2
* [ ] `_mm_storel_pd`
* [ ] `_mm_unpackhi_pd`
* [ ] `_mm_unpacklo_pd`
* [ ] `_mm_movemask_pd`
* [x] `_mm_movemask_pd`
* [ ] `_mm_shuffle_pd`
* [ ] `_mm_move_sd`
* [ ] `_mm_castpd_ps`
@ -234,6 +398,21 @@ sse2
* [ ] `_mm_undefined_si128`
sse3
----
* [ ] `_mm_addsub_ps`
* [ ] `_mm_addsub_pd`
* [ ] `_mm_hadd_pd`
* [ ] `_mm_hadd_ps`
* [ ] `_mm_hsub_pd`
* [ ] `_mm_hsub_ps`
* [ ] `_mm_lddqu_si128`
* [ ] `_mm_movedup_pd`
* [ ] `_mm_loaddup_pd`
* [ ] `_mm_movehdup_ps`
* [ ] `_mm_moveldup_ps`
ssse3
-----
* [ ] `_mm_abs_pi8`
@ -268,3 +447,91 @@ ssse3
* [ ] `_mm_sign_pi8`
* [ ] `_mm_sign_pi16`
* [ ] `_mm_sign_pi32`
sse4.1
------
* [ ] `_mm_blend_pd`
* [ ] `_mm_blend_ps`
* [ ] `_mm_blendv_pd`
* [ ] `_mm_blendv_ps`
* [ ] `_mm_blendv_epi8`
* [ ] `_mm_blend_epi16`
* [ ] `_mm_dp_pd`
* [ ] `_mm_dp_ps`
* [ ] `_mm_extract_ps`
* [ ] `_mm_extract_epi8`
* [ ] `_mm_extract_epi32`
* [ ] `_mm_extract_epi64`
* [ ] `_mm_insert_ps`
* [ ] `_mm_insert_epi8`
* [ ] `_mm_insert_epi32`
* [ ] `_mm_insert_epi64`
* [ ] `_mm_max_epi8`
* [ ] `_mm_max_epi32`
* [ ] `_mm_max_epu32`
* [ ] `_mm_max_epu16`
* [ ] `_mm_min_epi8`
* [ ] `_mm_min_epi32`
* [ ] `_mm_min_epu32`
* [ ] `_mm_min_epu16`
* [ ] `_mm_packus_epi32`
* [ ] `_mm_cmpeq_epi64`
* [ ] `_mm_cvtepi8_epi16`
* [ ] `_mm_cvtepi8_epi32`
* [ ] `_mm_cvtepi8_epi64`
* [ ] `_mm_cvtepi16_epi32`
* [ ] `_mm_cvtepi16_epi64`
* [ ] `_mm_cvtepi32_epi64`
* [ ] `_mm_cvtepu8_epi16`
* [ ] `_mm_cvtepu8_epi32`
* [ ] `_mm_cvtepu8_epi64`
* [ ] `_mm_cvtepu16_epi32`
* [ ] `_mm_cvtepu16_epi64`
* [ ] `_mm_cvtepu32_epi64`
* [ ] `_mm_mul_epi32`
* [ ] `_mm_mullo_epi32`
* [ ] `_mm_testz_si128`
* [ ] `_mm_testc_si128`
* [ ] `_mm_testnzc_si128`
* [ ] `_mm_test_all_zeros`
* [ ] `_mm_test_mix_ones_zeros`
* [ ] `_mm_test_all_ones`
* [ ] `_mm_round_pd`
* [ ] `_mm_floor_pd`
* [ ] `_mm_ceil_pd`
* [ ] `_mm_round_ps`
* [ ] `_mm_floor_ps`
* [ ] `_mm_ceil_ps`
* [ ] `_mm_round_sd`
* [ ] `_mm_floor_sd`
* [ ] `_mm_ceil_sd`
* [ ] `_mm_round_ss`
* [ ] `_mm_floor_ss`
* [ ] `_mm_ceil_ss`
* [ ] `_mm_minpos_epu16`
* [ ] `_mm_mpsadbw_epu8`
* [ ] `_mm_stream_load_si128`
sse4.2
------
* [ ] `_mm_cmpistrm`
* [ ] `_mm_cmpistri`
* [ ] `_mm_cmpistrz`
* [ ] `_mm_cmpistrc`
* [ ] `_mm_cmpistrs`
* [ ] `_mm_cmpistro`
* [ ] `_mm_cmpistra`
* [ ] `_mm_cmpestrm`
* [ ] `_mm_cmpestri`
* [ ] `_mm_cmpestrz`
* [ ] `_mm_cmpestrc`
* [ ] `_mm_cmpestrs`
* [ ] `_mm_cmpestro`
* [ ] `_mm_cmpestra`
* [ ] `_mm_cmpgt_epi64`
* [ ] `_mm_crc32_u8`
* [ ] `_mm_crc32_u16`
* [ ] `_mm_crc32_u32`
* [ ] `_mm_crc32_u64`

View file

@ -5,6 +5,7 @@
)]
pub use v128::*;
pub use v256::*;
pub use v64::*;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub use x86::*;
@ -13,6 +14,7 @@ pub use x86::*;
mod macros;
mod simd;
mod v128;
mod v256;
mod v64;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod x86;

View file

@ -8,8 +8,10 @@ macro_rules! define_ty {
}
macro_rules! define_impl {
($name:ident, $elemty:ident, $nelems:expr,
$($elname:ident),+) => {
(
$name:ident, $elemty:ident, $nelems:expr, $boolname:ident,
$($elname:ident),+
) => {
impl $name {
#[inline]
pub fn new($($elname: $elemty),*) -> $name {
@ -25,25 +27,46 @@ macro_rules! define_impl {
}),*)
}
#[inline(always)]
#[inline]
pub fn extract(self, idx: u32) -> $elemty {
assert!(idx < $nelems);
unsafe { simd_extract(self, idx) }
}
#[inline(always)]
pub fn insert(self, idx: u32, val: $elemty) -> $name {
#[inline]
pub fn replace(self, idx: u32, val: $elemty) -> $name {
assert!(idx < $nelems);
unsafe { simd_insert(self, idx, val) }
}
#[inline(always)]
#[inline]
pub fn store(self, slice: &mut [$elemty], offset: usize) {
assert!(slice[offset..].len() >= $nelems);
unsafe { self.store_unchecked(slice, offset) }
}
#[inline]
pub unsafe fn store_unchecked(
self,
slice: &mut [$elemty],
offset: usize,
) {
use std::mem::size_of;
use std::ptr;
ptr::copy_nonoverlapping(
&self as *const $name as *const u8,
slice.get_unchecked_mut(offset) as *mut $elemty as *mut u8,
size_of::<$name>());
}
#[inline]
pub fn load(slice: &[$elemty], offset: usize) -> $name {
assert!(slice[offset..].len() >= $nelems);
unsafe { $name::load_unchecked(slice, offset) }
}
#[inline(always)]
#[inline]
pub unsafe fn load_unchecked(
slice: &[$elemty],
offset: usize,
@ -58,6 +81,36 @@ macro_rules! define_impl {
size_of::<$name>());
x
}
#[inline]
pub fn eq(self, other: $name) -> $boolname {
unsafe { simd_eq(self, other) }
}
#[inline]
pub fn ne(self, other: $name) -> $boolname {
unsafe { simd_ne(self, other) }
}
#[inline]
pub fn lt(self, other: $name) -> $boolname {
unsafe { simd_lt(self, other) }
}
#[inline]
pub fn le(self, other: $name) -> $boolname {
unsafe { simd_le(self, other) }
}
#[inline]
pub fn gt(self, other: $name) -> $boolname {
unsafe { simd_gt(self, other) }
}
#[inline]
pub fn ge(self, other: $name) -> $boolname {
unsafe { simd_ge(self, other) }
}
}
}
}
@ -177,3 +230,15 @@ macro_rules! define_integer_ops {
)+
}
}
// Define lane-wise numeric cast methods between vector types.
//
// For each `(source, target, method)` triple this emits
// `impl source { pub fn method(self) -> ::target }`, implemented via the
// `simd_cast` platform intrinsic (a per-lane numeric conversion).
macro_rules! define_casts {
    ($(($ty:ident, $floatty:ident, $floatcast:ident)),+) => {
        $(
            impl $ty {
                /// Lane-wise cast of `self` to the target vector type.
                // `#[inline]` for consistency with the other methods
                // generated in this file (see `define_impl!`).
                #[inline]
                pub fn $floatcast(self) -> ::$floatty {
                    unsafe { simd_cast(self) }
                }
            }
        )+
    }
}

View file

@ -1,34 +1,34 @@
use simd::*;
define_ty! { f64x2, f64, f64 }
define_impl! { f64x2, f64, 2, x0, x1 }
define_impl! { f64x2, f64, 2, i64x2, x0, x1 }
define_ty! { f32x4, f32, f32, f32, f32 }
define_impl! { f32x4, f32, 4, x0, x1, x2, x3 }
define_impl! { f32x4, f32, 4, i32x4, x0, x1, x2, x3 }
define_ty! { u64x2, u64, u64 }
define_impl! { u64x2, u64, 2, x0, x1 }
define_impl! { u64x2, u64, 2, i64x2, x0, x1 }
define_ty! { i64x2, i64, i64 }
define_impl! { i64x2, i64, 2, x0, x1 }
define_impl! { i64x2, i64, 2, i64x2, x0, x1 }
define_ty! { u32x4, u32, u32, u32, u32 }
define_impl! { u32x4, u32, 4, x0, x1, x2, x3 }
define_impl! { u32x4, u32, 4, i32x4, x0, x1, x2, x3 }
define_ty! { i32x4, i32, i32, i32, i32 }
define_impl! { i32x4, i32, 4, x0, x1, x2, x3 }
define_impl! { i32x4, i32, 4, i32x4, x0, x1, x2, x3 }
define_ty! { u16x8, u16, u16, u16, u16, u16, u16, u16, u16 }
define_impl! { u16x8, u16, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_impl! { u16x8, u16, 8, i16x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! { i16x8, i16, i16, i16, i16, i16, i16, i16, i16 }
define_impl! { i16x8, i16, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_impl! { i16x8, i16, 8, i16x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! {
u8x16, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8
}
define_impl! {
u8x16, u8, 16,
u8x16, u8, 16, i8x16,
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
}
@ -36,7 +36,7 @@ define_ty! {
i8x16, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8
}
define_impl! {
i8x16, i8, 16,
i8x16, i8, 16, i8x16,
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
}
@ -61,3 +61,22 @@ define_integer_ops!(
(i16x8, i16),
(u8x16, u8),
(i8x16, i8));
define_casts!(
(f64x2, f32x2, as_f32x2),
(f64x2, u64x2, as_u64x2),
(f64x2, i64x2, as_i64x2),
(f32x4, f64x4, as_f64x4),
(f32x4, u32x4, as_u32x4),
(f32x4, i32x4, as_i32x4),
(u64x2, f64x2, as_f64x2),
(u64x2, i64x2, as_i64x2),
(i64x2, f64x2, as_f64x2),
(i64x2, u64x2, as_u64x2),
(u32x4, f32x4, as_f32x4),
(u32x4, i32x4, as_i32x4),
(i32x4, f32x4, as_f32x4),
(i32x4, u32x4, as_u32x4),
(u16x8, i16x8, as_i16x8),
(i16x8, u16x8, as_u16x8),
(u8x16, i8x16, as_i8x16),
(i8x16, u8x16, as_u8x16));

105
library/stdarch/src/v256.rs Normal file
View file

@ -0,0 +1,105 @@
use simd::*;
// 256-bit wide vector types, mirroring the 128-bit types in `v128`.
//
// `define_ty!` declares each vector type from its lane type repeated once
// per lane; `define_impl!` adds the shared API, with its fourth argument
// naming the vector type returned by the comparison methods
// (`eq`/`ne`/`lt`/`le`/`gt`/`ge` in `macros.rs`).
define_ty! { f64x4, f64, f64, f64, f64 }
define_impl! { f64x4, f64, 4, i64x4, x0, x1, x2, x3 }
define_ty! { f32x8, f32, f32, f32, f32, f32, f32, f32, f32 }
define_impl! { f32x8, f32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! { u64x4, u64, u64, u64, u64 }
define_impl! { u64x4, u64, 4, i64x4, x0, x1, x2, x3 }
define_ty! { i64x4, i64, i64, i64, i64 }
define_impl! { i64x4, i64, 4, i64x4, x0, x1, x2, x3 }
define_ty! { u32x8, u32, u32, u32, u32, u32, u32, u32, u32 }
define_impl! { u32x8, u32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! { i32x8, i32, i32, i32, i32, i32, i32, i32, i32 }
define_impl! { i32x8, i32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! {
    u16x16,
    u16, u16, u16, u16, u16, u16, u16, u16,
    u16, u16, u16, u16, u16, u16, u16, u16
}
define_impl! {
    u16x16, u16, 16, i16x16,
    x0, x1, x2, x3, x4, x5, x6, x7,
    x8, x9, x10, x11, x12, x13, x14, x15
}
define_ty! {
    i16x16,
    i16, i16, i16, i16, i16, i16, i16, i16,
    i16, i16, i16, i16, i16, i16, i16, i16
}
define_impl! {
    i16x16, i16, 16, i16x16,
    x0, x1, x2, x3, x4, x5, x6, x7,
    x8, x9, x10, x11, x12, x13, x14, x15
}
define_ty! {
    u8x32,
    u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8,
    u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8
}
define_impl! {
    u8x32, u8, 32, i8x32,
    x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,
    x16, x17, x18, x19, x20, x21, x22, x23,
    x24, x25, x26, x27, x28, x29, x30, x31
}
define_ty! {
    i8x32,
    i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8,
    i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8
}
define_impl! {
    i8x32, i8, 32, i8x32,
    x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,
    x16, x17, x18, x19, x20, x21, x22, x23,
    x24, x25, x26, x27, x28, x29, x30, x31
}

// Conversions between the same-width integer vector types (presumably
// `From` impls — see `define_from!` in macros.rs).
define_from!(u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32);
define_from!(i64x4, u64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32);
define_from!(u32x8, u64x4, i64x4, i32x8, u16x16, i16x16, u8x32, i8x32);
define_from!(i32x8, u64x4, i64x4, u32x8, u16x16, i16x16, u8x32, i8x32);
define_from!(u16x16, u64x4, i64x4, u32x8, i32x8, i16x16, u8x32, i8x32);
define_from!(i16x16, u64x4, i64x4, u32x8, i32x8, u16x16, u8x32, i8x32);
define_from!(u8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, i8x32);
define_from!(i8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32);

// Operator implementations shared by all of the above (see macros.rs).
define_common_ops!(
    f64x4, f32x8, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32);
define_float_ops!(f64x4, f32x8);
define_integer_ops!(
    (u64x4, u64),
    (i64x4, i64),
    (u32x8, u32),
    (i32x8, i32),
    (u16x16, u16),
    (i16x16, i16),
    (u8x32, u8),
    (i8x32, i8));

// Lane-wise `as_*` numeric casts. NOTE: `f64x4 -> f32x4` narrows to a
// 128-bit result type (4 lanes of f32).
define_casts!(
    (f64x4, f32x4, as_f32x4),
    (f64x4, u64x4, as_u64x4),
    (f64x4, i64x4, as_i64x4),
    (f32x8, u32x8, as_u32x8),
    (f32x8, i32x8, as_i32x8),
    (u64x4, f64x4, as_f64x4),
    (u64x4, i64x4, as_i64x4),
    (i64x4, f64x4, as_f64x4),
    (i64x4, u64x4, as_u64x4),
    (u32x8, f32x8, as_f32x8),
    (u32x8, i32x8, as_i32x8),
    (i32x8, f32x8, as_f32x8),
    (i32x8, u32x8, as_u32x8),
    (u16x16, i16x16, as_i16x16),
    (i16x16, u16x16, as_u16x16),
    (u8x32, i8x32, as_i8x32),
    (i8x32, u8x32, as_u8x32));

View file

@ -1,25 +1,25 @@
use simd::*;
define_ty! { f32x2, f32, f32 }
define_impl! { f32x2, f32, 2, x0, x1 }
define_impl! { f32x2, f32, 2, i32x2, x0, x1 }
define_ty! { u32x2, u32, u32 }
define_impl! { u32x2, u32, 2, x0, x1 }
define_impl! { u32x2, u32, 2, i32x2, x0, x1 }
define_ty! { i32x2, i32, i32 }
define_impl! { i32x2, i32, 2, x0, x1 }
define_impl! { i32x2, i32, 2, i32x2, x0, x1 }
define_ty! { u16x4, u16, u16, u16, u16 }
define_impl! { u16x4, u16, 4, x0, x1, x2, x3 }
define_impl! { u16x4, u16, 4, i16x4, x0, x1, x2, x3 }
define_ty! { i16x4, i16, i16, i16, i16 }
define_impl! { i16x4, i16, 4, x0, x1, x2, x3 }
define_impl! { i16x4, i16, 4, i16x4, x0, x1, x2, x3 }
define_ty! { u8x8, u8, u8, u8, u8, u8, u8, u8, u8 }
define_impl! { u8x8, u8, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_impl! { u8x8, u8, 8, i8x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! { i8x8, i8, i8, i8, i8, i8, i8, i8, i8 }
define_impl! { i8x8, i8, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_impl! { i8x8, i8, 8, i8x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_from!(u32x2, i32x2, u16x4, i16x4, u8x8, i8x8);
define_from!(i32x2, u32x2, u16x4, i16x4, u8x8, i8x8);
@ -37,3 +37,15 @@ define_integer_ops!(
(i16x4, i16),
(u8x8, u8),
(i8x8, i8));
define_casts!(
(f32x2, f64x2, as_f64x2),
(f32x2, u32x2, as_u32x2),
(f32x2, i32x2, as_i32x2),
(u32x2, f32x2, as_f32x2),
(u32x2, i32x2, as_i32x2),
(i32x2, f32x2, as_f32x2),
(i32x2, u32x2, as_u32x2),
(u16x4, i16x4, as_i16x4),
(i16x4, u16x4, as_u16x4),
(u8x8, i8x8, as_i8x8),
(i8x8, u8x8, as_u8x8));

View file

@ -1,4 +1,4 @@
// pub use self::sse::*;
pub use self::sse::*;
pub use self::sse2::*;
pub use self::ssse3::*;
pub use self::sse42::*;
@ -6,7 +6,7 @@ pub use self::sse42::*;
#[allow(non_camel_case_types)]
pub type __m128i = ::v128::i8x16;
// mod sse;
mod sse;
mod sse2;
mod ssse3;
mod sse42;

View file

@ -0,0 +1,128 @@
use v128::*;
/// Return the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
#[inline(always)]
#[target_feature = "+sse"]
pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
    // Lowers to `llvm.x86.sse.sqrt.ps` (declared in the extern block below).
    unsafe { sqrtps(a) }
}
/// Return the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`.
///
/// NOTE(review): this is a hardware approximation (`llvm.x86.sse.rcp.ps`),
/// not an exact 1/x — the precision bound is given by the Intel intrinsics
/// guide; confirm before relying on exact results.
#[inline(always)]
#[target_feature = "+sse"]
pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
    unsafe { rcpps(a) }
}
/// Return the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`.
///
/// NOTE(review): approximate result (`llvm.x86.sse.rsqrt.ps`), same caveat
/// as `_mm_rcp_ps`.
#[inline(always)]
#[target_feature = "+sse"]
pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
    unsafe { rsqrtps(a) }
}
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding minimum values.
#[inline(always)]
#[target_feature = "+sse"]
pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
    // Lowers to `llvm.x86.sse.min.ps` rather than a generic lane-wise min,
    // preserving the hardware's MINPS semantics.
    unsafe { minps(a, b) }
}
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding maximum values.
#[inline(always)]
#[target_feature = "+sse"]
pub fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
    // Lowers to `llvm.x86.sse.max.ps` rather than a generic lane-wise max,
    // preserving the hardware's MAXPS semantics.
    unsafe { maxps(a, b) }
}
/// Return a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 4 least significant bits of the return value.
/// All other bits are set to `0`.
#[inline(always)]
#[target_feature = "+sse"]
pub fn _mm_movemask_ps(a: f32x4) -> i32 {
    // Bit i of the result is the sign bit of lane i
    // (`llvm.x86.sse.movmsk.ps`).
    unsafe { movmskps(a) }
}
// Raw LLVM intrinsics backing the safe wrappers above; each `link_name`
// selects the corresponding x86 SSE operation. `improper_ctypes` is
// allowed because SIMD vector types are not ordinary C FFI types.
#[allow(improper_ctypes)]
extern {
    #[link_name = "llvm.x86.sse.sqrt.ps"]
    fn sqrtps(a: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.rcp.ps"]
    fn rcpps(a: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.rsqrt.ps"]
    fn rsqrtps(a: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.min.ps"]
    fn minps(a: f32x4, b: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.max.ps"]
    fn maxps(a: f32x4, b: f32x4) -> f32x4;
    #[link_name = "llvm.x86.sse.movmsk.ps"]
    fn movmskps(a: f32x4) -> i32;
}
#[cfg(test)]
mod tests {
    use v128::*;
    use x86::sse;

    #[test]
    #[target_feature = "+sse"]
    fn _mm_sqrt_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_sqrt_ps(a);
        // sqrt(13) rounded to f32; the other three lanes are exact.
        let e = f32x4::new(2.0, 3.6055512, 4.0, 10.0);
        assert_eq!(r, e);
    }

    #[test]
    #[target_feature = "+sse"]
    fn _mm_rcp_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_rcp_ps(a);
        // NOTE(review): RCPPS is an approximation; these exact expected
        // values assume a particular hardware approximation and may not
        // hold bit-for-bit on every CPU — confirm across targets.
        let e = f32x4::new(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
        assert_eq!(r, e);
    }

    #[test]
    #[target_feature = "+sse"]
    fn _mm_rsqrt_ps() {
        let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
        let r = sse::_mm_rsqrt_ps(a);
        // NOTE(review): same approximation caveat as `_mm_rcp_ps`.
        let e = f32x4::new(0.49987793, 0.2772827, 0.24993896, 0.099990845);
        assert_eq!(r, e);
    }

    #[test]
    #[target_feature = "+sse"]
    fn _mm_min_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
        let r = sse::_mm_min_ps(a, b);
        // Lane-wise minimum of `a` and `b`.
        assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
    }

    #[test]
    #[target_feature = "+sse"]
    fn _mm_max_ps() {
        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
        let r = sse::_mm_max_ps(a, b);
        // Lane-wise maximum of `a` and `b`.
        assert_eq!(r, f32x4::new(-1.0, 20.0, 0.0, -5.0));
    }

    #[test]
    #[target_feature = "+sse"]
    fn _mm_movemask_ps() {
        // Negative lanes 0 and 2 -> bits 0 and 2 set.
        let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0));
        assert_eq!(r, 0b0101);
        // Negative lanes 0, 1, 2 -> low three bits set.
        let r = sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0));
        assert_eq!(r, 0b0111);
    }
}

View file

@ -2,7 +2,9 @@ use std::mem;
use std::os::raw::c_void;
use std::ptr;
use simd::*;
use simd::{
simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
};
use x86::__m128i;
use v128::*;
use v64::*;
@ -519,63 +521,63 @@ pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpeq_epi8(a: i8x16, b: i8x16) -> i8x16 {
unsafe { simd_eq(a, b) }
a.eq(b)
}
/// Compare packed 16-bit integers in `a` and `b` for equality.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpeq_epi16(a: i16x8, b: i16x8) -> i16x8 {
unsafe { simd_eq(a, b) }
a.eq(b)
}
/// Compare packed 32-bit integers in `a` and `b` for equality.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpeq_epi32(a: i32x4, b: i32x4) -> i32x4 {
unsafe { simd_eq(a, b) }
a.eq(b)
}
/// Compare packed 8-bit integers in `a` and `b` for greater-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpgt_epi8(a: i8x16, b: i8x16) -> i8x16 {
unsafe { simd_gt(a, b) }
a.gt(b)
}
/// Compare packed 16-bit integers in `a` and `b` for greater-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpgt_epi16(a: i16x8, b: i16x8) -> i16x8 {
unsafe { simd_gt(a, b) }
a.gt(b)
}
/// Compare packed 32-bit integers in `a` and `b` for greater-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpgt_epi32(a: i32x4, b: i32x4) -> i32x4 {
unsafe { simd_gt(a, b) }
a.gt(b)
}
/// Compare packed 8-bit integers in `a` and `b` for less-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmplt_epi8(a: i8x16, b: i8x16) -> i8x16 {
unsafe { simd_lt(a, b) }
a.lt(b)
}
/// Compare packed 16-bit integers in `a` and `b` for less-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmplt_epi16(a: i16x8, b: i16x8) -> i16x8 {
unsafe { simd_lt(a, b) }
a.lt(b)
}
/// Compare packed 32-bit integers in `a` and `b` for less-than.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 {
unsafe { simd_lt(a, b) }
a.lt(b)
}
/// Convert the lower two packed 32-bit integers in `a` to packed
@ -591,7 +593,7 @@ pub fn _mm_cvtepi32_pd(a: i32x4) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtsi32_sd(a: f64x2, b: i32) -> f64x2 {
a.insert(0, b as f64)
a.replace(0, b as f64)
}
/// Return `a` with its lower element replaced by `b` after converting it to
@ -599,7 +601,7 @@ pub fn _mm_cvtsi32_sd(a: f64x2, b: i32) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cvtsi64_sd(a: f64x2, b: i64) -> f64x2 {
a.insert(0, b as f64)
a.replace(0, b as f64)
}
/// Return `a` with its lower element replaced by `b` after converting it to
@ -842,7 +844,7 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_move_epi64(a: i64x2) -> i64x2 {
a.insert(1, 0)
a.replace(1, 0)
}
/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
@ -880,7 +882,7 @@ pub fn _mm_extract_epi16(a: i16x8, imm8: i32) -> i32 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_insert_epi16(a: i16x8, i: i32, imm8: i32) -> i16x8 {
a.insert(imm8 as u32 & 0b111, i as i16)
a.replace(imm8 as u32 & 0b111, i as i16)
}
/// Return a mask of the most significant bit of each element in `a`.
@ -1134,7 +1136,7 @@ pub fn _mm_unpacklo_epi64(a: i64x2, b: i64x2) -> i64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_add_sd(a: f64x2, b: f64x2) -> f64x2 {
a.insert(0, a.extract(0) + b.extract(0))
a.replace(0, a.extract(0) + b.extract(0))
}
/// Add packed double-precision (64-bit) floating-point elements in `a` and
@ -1150,7 +1152,7 @@ pub fn _mm_add_pd(a: f64x2, b: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_div_sd(a: f64x2, b: f64x2) -> f64x2 {
a.insert(0, a.extract(0) / b.extract(0))
a.replace(0, a.extract(0) / b.extract(0))
}
/// Divide packed double-precision (64-bit) floating-point elements in `a` by
@ -1198,7 +1200,7 @@ pub fn _mm_min_pd(a: f64x2, b: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_mul_sd(a: f64x2, b: f64x2) -> f64x2 {
a.insert(0, a.extract(0) * b.extract(0))
a.replace(0, a.extract(0) * b.extract(0))
}
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
@ -1214,7 +1216,7 @@ pub fn _mm_mul_pd(a: f64x2, b: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sqrt_sd(a: f64x2, b: f64x2) -> f64x2 {
a.insert(0, unsafe { sqrtsd(b).extract(0) })
a.replace(0, unsafe { sqrtsd(b).extract(0) })
}
/// Return a new vector with the square root of each of the values in `a`.
@ -1229,7 +1231,7 @@ pub fn _mm_sqrt_pd(a: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_sub_sd(a: f64x2, b: f64x2) -> f64x2 {
a.insert(0, a.extract(0) - b.extract(0))
a.replace(0, a.extract(0) - b.extract(0))
}
/// Subtract packed double-precision (64-bit) floating-point elements in `b`
@ -1314,7 +1316,7 @@ pub fn _mm_cmple_sd(a: f64x2, b: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmplt_sd(b, a).insert(1, a.extract(1))
_mm_cmplt_sd(b, a).replace(1, a.extract(1))
}
/// Return a new vector with the low element of `a` replaced by the
@ -1322,7 +1324,7 @@ pub fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpge_sd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmple_sd(b, a).insert(1, a.extract(1))
_mm_cmple_sd(b, a).replace(1, a.extract(1))
}
/// Return a new vector with the low element of `a` replaced by the result
@ -1373,7 +1375,7 @@ pub fn _mm_cmpnle_sd(a: f64x2, b: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmpnlt_sd(b, a).insert(1, a.extract(1))
_mm_cmpnlt_sd(b, a).replace(1, a.extract(1))
}
/// Return a new vector with the low element of `a` replaced by the
@ -1381,7 +1383,7 @@ pub fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_cmpnge_sd(a: f64x2, b: f64x2) -> f64x2 {
_mm_cmpnle_sd(b, a).insert(1, a.extract(1))
_mm_cmpnle_sd(b, a).replace(1, a.extract(1))
}
/// Compare corresponding elements in `a` and `b` for equality.
@ -1553,8 +1555,15 @@ pub fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool {
unsafe { mem::transmute(ucomineqsd(a, b) as u8) }
}
/// Return a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 2 least significant bits of the return value.
/// All other bits are set to `0`.
#[inline(always)]
#[target_feature = "+sse2"]
pub fn _mm_movemask_pd(a: f64x2) -> i32 {
    // Bit i of the result is the sign bit of lane i
    // (`llvm.x86.sse2.movmsk.pd`, declared in the extern block below).
    unsafe { movmskpd(a) }
}
@ -1703,6 +1712,8 @@ extern {
fn ucomigesd(a: f64x2, b: f64x2) -> i32;
#[link_name = "llvm.x86.sse2.ucomineq.sd"]
fn ucomineqsd(a: f64x2, b: f64x2) -> i32;
#[link_name = "llvm.x86.sse2.movmsk.pd"]
fn movmskpd(a: f64x2) -> i32;
}
#[cfg(test)]
@ -2306,7 +2317,7 @@ mod tests {
let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b = i16x8::new(7, 6, 2, 4, 3, 2, 1, 0);
let r = sse2::_mm_cmpeq_epi16(a, b);
assert_eq!(r, i16x8::splat(0).insert(2, 0xFFFFu16 as i16));
assert_eq!(r, i16x8::splat(0).replace(2, 0xFFFFu16 as i16));
}
#[test]
@ -2314,55 +2325,55 @@ mod tests {
let a = i32x4::new(0, 1, 2, 3);
let b = i32x4::new(3, 2, 2, 0);
let r = sse2::_mm_cmpeq_epi32(a, b);
assert_eq!(r, i32x4::splat(0).insert(2, 0xFFFFFFFFu32 as i32));
assert_eq!(r, i32x4::splat(0).replace(2, 0xFFFFFFFFu32 as i32));
}
#[test]
fn _mm_cmpgt_epi8() {
let a = i8x16::splat(0).insert(0, 5);
let a = i8x16::splat(0).replace(0, 5);
let b = i8x16::splat(0);
let r = sse2::_mm_cmpgt_epi8(a, b);
assert_eq!(r, i8x16::splat(0).insert(0, 0xFFu8 as i8));
assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8));
}
#[test]
fn _mm_cmpgt_epi16() {
let a = i16x8::splat(0).insert(0, 5);
let a = i16x8::splat(0).replace(0, 5);
let b = i16x8::splat(0);
let r = sse2::_mm_cmpgt_epi16(a, b);
assert_eq!(r, i16x8::splat(0).insert(0, 0xFFFFu16 as i16));
assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16));
}
#[test]
fn _mm_cmpgt_epi32() {
let a = i32x4::splat(0).insert(0, 5);
let a = i32x4::splat(0).replace(0, 5);
let b = i32x4::splat(0);
let r = sse2::_mm_cmpgt_epi32(a, b);
assert_eq!(r, i32x4::splat(0).insert(0, 0xFFFFFFFFu32 as i32));
assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
}
#[test]
fn _mm_cmplt_epi8() {
let a = i8x16::splat(0);
let b = i8x16::splat(0).insert(0, 5);
let b = i8x16::splat(0).replace(0, 5);
let r = sse2::_mm_cmplt_epi8(a, b);
assert_eq!(r, i8x16::splat(0).insert(0, 0xFFu8 as i8));
assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8));
}
#[test]
fn _mm_cmplt_epi16() {
let a = i16x8::splat(0);
let b = i16x8::splat(0).insert(0, 5);
let b = i16x8::splat(0).replace(0, 5);
let r = sse2::_mm_cmplt_epi16(a, b);
assert_eq!(r, i16x8::splat(0).insert(0, 0xFFFFu16 as i16));
assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16));
}
#[test]
fn _mm_cmplt_epi32() {
let a = i32x4::splat(0);
let b = i32x4::splat(0).insert(0, 5);
let b = i32x4::splat(0).replace(0, 5);
let r = sse2::_mm_cmplt_epi32(a, b);
assert_eq!(r, i32x4::splat(0).insert(0, 0xFFFFFFFFu32 as i32));
assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32));
}
#[test]
@ -2504,12 +2515,12 @@ mod tests {
#[test]
fn _mm_maskmoveu_si128() {
let a = i8x16::splat(9);
let mask = i8x16::splat(0).insert(2, 0x80u8 as i8);
let mask = i8x16::splat(0).replace(2, 0x80u8 as i8);
let mut r = i8x16::splat(0);
unsafe {
sse2::_mm_maskmoveu_si128(a, mask, &mut r as *mut _ as *mut i8);
}
assert_eq!(r, i8x16::splat(0).insert(2, 9));
assert_eq!(r, i8x16::splat(0).replace(2, 9));
}
#[test]
@ -2586,7 +2597,7 @@ mod tests {
#[test]
fn _mm_insert_epi16() {
let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
assert_eq!(sse2::_mm_insert_epi16(a, 9, 0), a.insert(0, 9));
assert_eq!(sse2::_mm_insert_epi16(a, 9, 0), a.replace(0, 9));
}
#[test]
@ -3207,4 +3218,13 @@ mod tests {
let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0));
assert!(!sse2::_mm_ucomineq_sd(a, b));
}
#[test]
fn _mm_movemask_pd() {
    // Only lane 0 is negative -> bit 0 set.
    let r = sse2::_mm_movemask_pd(f64x2::new(-1.0, 5.0));
    assert_eq!(r, 0b01);
    // Both lanes negative -> both low bits set.
    let r = sse2::_mm_movemask_pd(f64x2::new(-1.0, -5.0));
    assert_eq!(r, 0b11);
}
}