diff --git a/library/stdarch/TODO.md b/library/stdarch/TODO.md index d97f0d4b6e4d..42ef6e43bb18 100644 --- a/library/stdarch/TODO.md +++ b/library/stdarch/TODO.md @@ -1,3 +1,167 @@ +**TIP**: Use the following command to generate a section in this list for +Intel intrinsics. Replace `SSE4.2` with the intended type. + +``` +rg '^> TODO.md +``` + + +sse +--- +* [ ] `_MM_TRANSPOSE4_PS` +* [ ] `_mm_getcsr` +* [ ] `_mm_setcsr` +* [ ] `_MM_GET_EXCEPTION_STATE` +* [ ] `_MM_SET_EXCEPTION_STATE` +* [ ] `_MM_GET_EXCEPTION_MASK` +* [ ] `_MM_SET_EXCEPTION_MASK` +* [ ] `_MM_GET_ROUNDING_MODE` +* [ ] `_MM_SET_ROUNDING_MODE` +* [ ] `_MM_GET_FLUSH_ZERO_MODE` +* [ ] `_MM_SET_FLUSH_ZERO_MODE` +* [ ] `_mm_prefetch` +* [ ] `_mm_sfence` +* [ ] `_mm_max_pi16` +* [ ] `_m_pmaxsw` +* [ ] `_mm_max_pu8` +* [ ] `_m_pmaxub` +* [ ] `_mm_min_pi16` +* [ ] `_m_pminsw` +* [ ] `_mm_min_pu8` +* [ ] `_m_pminub` +* [ ] `_mm_mulhi_pu16` +* [ ] `_m_pmulhuw` +* [ ] `_mm_avg_pu8` +* [ ] `_m_pavgb` +* [ ] `_mm_avg_pu16` +* [ ] `_m_pavgw` +* [ ] `_mm_sad_pu8` +* [ ] `_m_psadbw` +* [ ] `_mm_cvtsi32_ss` +* [ ] `_mm_cvt_si2ss` +* [ ] `_mm_cvtsi64_ss` +* [ ] `_mm_cvtpi32_ps` +* [ ] `_mm_cvt_pi2ps` +* [ ] `_mm_cvtpi16_ps` +* [ ] `_mm_cvtpu16_ps` +* [ ] `_mm_cvtpi8_ps` +* [ ] `_mm_cvtpu8_ps` +* [ ] `_mm_cvtpi32x2_ps` +* [ ] `_mm_stream_pi` +* [ ] `_mm_maskmove_si64` +* [ ] `_m_maskmovq` +* [ ] `_mm_extract_pi16` +* [ ] `_m_pextrw` +* [ ] `_mm_insert_pi16` +* [ ] `_m_pinsrw` +* [ ] `_mm_movemask_pi8` +* [ ] `_m_pmovmskb` +* [ ] `_mm_shuffle_pi16` +* [ ] `_m_pshufw` +* [ ] `_mm_add_ss` +* [ ] `_mm_add_ps` +* [ ] `_mm_sub_ss` +* [ ] `_mm_sub_ps` +* [ ] `_mm_mul_ss` +* [ ] `_mm_mul_ps` +* [ ] `_mm_div_ss` +* [ ] `_mm_div_ps` +* [ ] `_mm_sqrt_ss` +* [x] `_mm_sqrt_ps` +* [ ] `_mm_rcp_ss` +* [x] `_mm_rcp_ps` +* [ ] `_mm_rsqrt_ss` +* [x] `_mm_rsqrt_ps` +* [ ] `_mm_min_ss` +* [x] `_mm_min_ps` +* [ ] `_mm_max_ss` +* [x] `_mm_max_ps` +* [ ] `_mm_and_ps` +* [ ] `_mm_andnot_ps` +* [ ] `_mm_or_ps` +* [ ] `_mm_xor_ps` +* [ ] `_mm_cmpeq_ss` +* [ ] `_mm_cmpeq_ps` +* [ ] `_mm_cmplt_ss` +* [ ] `_mm_cmplt_ps` +* [ ] `_mm_cmple_ss` +* [ ] `_mm_cmple_ps` +* [ ] `_mm_cmpgt_ss` +* [ ] `_mm_cmpgt_ps` +* [ ] `_mm_cmpge_ss` +* [ ] `_mm_cmpge_ps` +* [ ] `_mm_cmpneq_ss` +* [ ] `_mm_cmpneq_ps` +* [ ] `_mm_cmpnlt_ss` +* [ ] `_mm_cmpnlt_ps` +* [ ] `_mm_cmpnle_ss` +* [ ] `_mm_cmpnle_ps` +* [ ] `_mm_cmpngt_ss` +* [ ] `_mm_cmpngt_ps` +* [ ] `_mm_cmpnge_ss` +* [ ] `_mm_cmpnge_ps` +* [ ] `_mm_cmpord_ss` +* [ ] `_mm_cmpord_ps` +* [ ] `_mm_cmpunord_ss` +* [ ] `_mm_cmpunord_ps` +* [ ] `_mm_comieq_ss` +* [ ] `_mm_comilt_ss` +* [ ] `_mm_comile_ss` +* [ ] `_mm_comigt_ss` +* [ ] `_mm_comige_ss` +* [ ] `_mm_comineq_ss` +* [ ] `_mm_ucomieq_ss` +* [ ] `_mm_ucomilt_ss` +* [ ] `_mm_ucomile_ss` +* [ ] `_mm_ucomigt_ss` +* [ ] `_mm_ucomige_ss` +* [ ] `_mm_ucomineq_ss` +* [ ] `_mm_cvtss_si32` +* [ ] `_mm_cvt_ss2si` +* [ ] `_mm_cvtss_si64` +* [ ] `_mm_cvtss_f32` +* [ ] `_mm_cvtps_pi32` +* [ ] `_mm_cvt_ps2pi` +* [ ] `_mm_cvttss_si32` +* [ ] `_mm_cvtt_ss2si` +* [ ] `_mm_cvttss_si64` +* [ ] `_mm_cvttps_pi32` +* [ ] `_mm_cvtt_ps2pi` +* [ ] `_mm_cvtps_pi16` +* [ ] `_mm_cvtps_pi8` +* [ ] `_mm_set_ss` +* [ ] `_mm_set1_ps` +* [ ] `_mm_set_ps1` +* [ ] `_mm_set_ps` +* [ ] `_mm_setr_ps` +* [ ] `_mm_setzero_ps` +* [ ] `_mm_loadh_pi` +* [ ] `_mm_loadl_pi` +* [ ] `_mm_load_ss` +* [ ] `_mm_load1_ps` +* [ ] `_mm_load_ps1` +* [ ] `_mm_load_ps` +* [ ] `_mm_loadu_ps` +* [ ] `_mm_loadr_ps` +* [ ] `_mm_stream_ps` +* [ ] `_mm_storeh_pi` +* [ ] `_mm_storel_pi` +* [ ] `_mm_store_ss` +* [ ] `_mm_store1_ps` +* [ ] `_mm_store_ps1` +* [ ] `_mm_store_ps` +* [ ] `_mm_storeu_ps` +* [ ] `_mm_storer_ps` +* [ ] `_mm_move_ss` +* [ ] `_mm_shuffle_ps` +* [ ] `_mm_unpackhi_ps` +* [ ] `_mm_unpacklo_ps` +* [ ] `_mm_movehl_ps` +* [ ] `_mm_movelh_ps` +* [x] `_mm_movemask_ps` +* [ ] `_mm_undefined_ps` + + sse2 ---- * [x] `_mm_pause` @@ -221,7 +385,7 @@ sse2 * [ ] `_mm_storel_pd` * [ ] `_mm_unpackhi_pd` * [ ] `_mm_unpacklo_pd` -* [ ] `_mm_movemask_pd` +* [x] `_mm_movemask_pd` * [ ] `_mm_shuffle_pd` * [ ] `_mm_move_sd` * [ ] `_mm_castpd_ps` @@ -234,6 +398,21 @@ sse2 * [ ] `_mm_undefined_si128` +sse3 +---- +* [ ] `_mm_addsub_ps` +* [ ] `_mm_addsub_pd` +* [ ] `_mm_hadd_pd` +* [ ] `_mm_hadd_ps` +* [ ] `_mm_hsub_pd` +* [ ] `_mm_hsub_ps` +* [ ] `_mm_lddqu_si128` +* [ ] `_mm_movedup_pd` +* [ ] `_mm_loaddup_pd` +* [ ] `_mm_movehdup_ps` +* [ ] `_mm_moveldup_ps` + + ssse3 ----- * [ ] `_mm_abs_pi8` @@ -268,3 +447,91 @@ ssse3 * [ ] `_mm_sign_pi8` * [ ] `_mm_sign_pi16` * [ ] `_mm_sign_pi32` + + +sse4.1 +------ +* [ ] `_mm_blend_pd` +* [ ] `_mm_blend_ps` +* [ ] `_mm_blendv_pd` +* [ ] `_mm_blendv_ps` +* [ ] `_mm_blendv_epi8` +* [ ] `_mm_blend_epi16` +* [ ] `_mm_dp_pd` +* [ ] `_mm_dp_ps` +* [ ] `_mm_extract_ps` +* [ ] `_mm_extract_epi8` +* [ ] `_mm_extract_epi32` +* [ ] `_mm_extract_epi64` +* [ ] `_mm_insert_ps` +* [ ] `_mm_insert_epi8` +* [ ] `_mm_insert_epi32` +* [ ] `_mm_insert_epi64` +* [ ] `_mm_max_epi8` +* [ ] `_mm_max_epi32` +* [ ] `_mm_max_epu32` +* [ ] `_mm_max_epu16` +* [ ] `_mm_min_epi8` +* [ ] `_mm_min_epi32` +* [ ] `_mm_min_epu32` +* [ ] `_mm_min_epu16` +* [ ] `_mm_packus_epi32` +* [ ] `_mm_cmpeq_epi64` +* [ ] `_mm_cvtepi8_epi16` +* [ ] `_mm_cvtepi8_epi32` +* [ ] `_mm_cvtepi8_epi64` +* [ ] `_mm_cvtepi16_epi32` +* [ ] `_mm_cvtepi16_epi64` +* [ ] `_mm_cvtepi32_epi64` +* [ ] `_mm_cvtepu8_epi16` +* [ ] `_mm_cvtepu8_epi32` +* [ ] `_mm_cvtepu8_epi64` +* [ ] `_mm_cvtepu16_epi32` +* [ ] `_mm_cvtepu16_epi64` +* [ ] `_mm_cvtepu32_epi64` +* [ ] `_mm_mul_epi32` +* [ ] `_mm_mullo_epi32` +* [ ] `_mm_testz_si128` +* [ ] `_mm_testc_si128` +* [ ] `_mm_testnzc_si128` +* [ ] `_mm_test_all_zeros` +* [ ] `_mm_test_mix_ones_zeros` +* [ ] `_mm_test_all_ones` +* [ ] `_mm_round_pd` +* [ ] `_mm_floor_pd` +* [ ] `_mm_ceil_pd` +* [ ] `_mm_round_ps` +* [ ] `_mm_floor_ps` +* [ ] `_mm_ceil_ps` +* [ ] `_mm_round_sd` +* [ ] `_mm_floor_sd` +* [ ] `_mm_ceil_sd` +* [ ] `_mm_round_ss` +* [ ] `_mm_floor_ss` +* [ ] `_mm_ceil_ss` +* [ ] `_mm_minpos_epu16` +* [ ] `_mm_mpsadbw_epu8` +* [ ] `_mm_stream_load_si128` + + +sse4.2 +------ +* [ ] `_mm_cmpistrm` +* [ ] `_mm_cmpistri` +* [ ] `_mm_cmpistrz` +* [ ] `_mm_cmpistrc` +* [ ] `_mm_cmpistrs` +* [ ] `_mm_cmpistro` +* [ ] `_mm_cmpistra` +* [ ] `_mm_cmpestrm` +* [ ] `_mm_cmpestri` +* [ ] `_mm_cmpestrz` +* [ ] `_mm_cmpestrc` +* [ ] `_mm_cmpestrs` +* [ ] `_mm_cmpestro` +* [ ] `_mm_cmpestra` +* [ ] `_mm_cmpgt_epi64` +* [ ] `_mm_crc32_u8` +* [ ] `_mm_crc32_u16` +* [ ] `_mm_crc32_u32` +* [ ] `_mm_crc32_u64` diff --git a/library/stdarch/src/lib.rs b/library/stdarch/src/lib.rs index 429601a37170..a3c770d5492c 100644 --- a/library/stdarch/src/lib.rs +++ b/library/stdarch/src/lib.rs @@ -5,6 +5,7 @@ )] pub use v128::*; +pub use v256::*; pub use v64::*; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub use x86::*; @@ -13,6 +14,7 @@ pub use x86::*; mod macros; mod simd; mod v128; +mod v256; mod v64; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod x86; diff --git a/library/stdarch/src/macros.rs b/library/stdarch/src/macros.rs index b3cf8ecf4e85..4a8bddb012e7 100644 --- a/library/stdarch/src/macros.rs +++ b/library/stdarch/src/macros.rs @@ -8,8 +8,10 @@ macro_rules! define_ty { } macro_rules! define_impl { - ($name:ident, $elemty:ident, $nelems:expr, - $($elname:ident),+) => { + ( + $name:ident, $elemty:ident, $nelems:expr, $boolname:ident, + $($elname:ident),+ + ) => { impl $name { #[inline] pub fn new($($elname: $elemty),*) -> $name { @@ -25,25 +27,46 @@ macro_rules! define_impl { }),*) } - #[inline(always)] + #[inline] pub fn extract(self, idx: u32) -> $elemty { assert!(idx < $nelems); unsafe { simd_extract(self, idx) } } - #[inline(always)] - pub fn insert(self, idx: u32, val: $elemty) -> $name { + #[inline] + pub fn replace(self, idx: u32, val: $elemty) -> $name { assert!(idx < $nelems); unsafe { simd_insert(self, idx, val) } } - #[inline(always)] + #[inline] + pub fn store(self, slice: &mut [$elemty], offset: usize) { + assert!(slice[offset..].len() >= $nelems); + unsafe { self.store_unchecked(slice, offset) } + } + + #[inline] + pub unsafe fn store_unchecked( + self, + slice: &mut [$elemty], + offset: usize, + ) { + use std::mem::size_of; + use std::ptr; + + ptr::copy_nonoverlapping( + &self as *const $name as *const u8, + slice.get_unchecked_mut(offset) as *mut $elemty as *mut u8, + size_of::<$name>()); + } + + #[inline] pub fn load(slice: &[$elemty], offset: usize) -> $name { assert!(slice[offset..].len() >= $nelems); unsafe { $name::load_unchecked(slice, offset) } } - #[inline(always)] + #[inline] pub unsafe fn load_unchecked( slice: &[$elemty], offset: usize, @@ -58,6 +81,36 @@ macro_rules! define_impl { size_of::<$name>()); x } + + #[inline] + pub fn eq(self, other: $name) -> $boolname { + unsafe { simd_eq(self, other) } + } + + #[inline] + pub fn ne(self, other: $name) -> $boolname { + unsafe { simd_ne(self, other) } + } + + #[inline] + pub fn lt(self, other: $name) -> $boolname { + unsafe { simd_lt(self, other) } + } + + #[inline] + pub fn le(self, other: $name) -> $boolname { + unsafe { simd_le(self, other) } + } + + #[inline] + pub fn gt(self, other: $name) -> $boolname { + unsafe { simd_gt(self, other) } + } + + #[inline] + pub fn ge(self, other: $name) -> $boolname { + unsafe { simd_ge(self, other) } + } } } } @@ -177,3 +230,15 @@ macro_rules! define_integer_ops { )+ } } + +macro_rules! define_casts { + ($(($ty:ident, $floatty:ident, $floatcast:ident)),+) => { + $( + impl $ty { + pub fn $floatcast(self) -> ::$floatty { + unsafe { simd_cast(self) } + } + } + )+ + } +} diff --git a/library/stdarch/src/v128.rs b/library/stdarch/src/v128.rs index 2e0625663bc3..4a8bf358da6a 100644 --- a/library/stdarch/src/v128.rs +++ b/library/stdarch/src/v128.rs @@ -1,34 +1,34 @@ use simd::*; define_ty! { f64x2, f64, f64 } -define_impl! { f64x2, f64, 2, x0, x1 } +define_impl! { f64x2, f64, 2, i64x2, x0, x1 } define_ty! { f32x4, f32, f32, f32, f32 } -define_impl! { f32x4, f32, 4, x0, x1, x2, x3 } +define_impl! { f32x4, f32, 4, i32x4, x0, x1, x2, x3 } define_ty! { u64x2, u64, u64 } -define_impl! { u64x2, u64, 2, x0, x1 } +define_impl! { u64x2, u64, 2, i64x2, x0, x1 } define_ty! { i64x2, i64, i64 } -define_impl! { i64x2, i64, 2, x0, x1 } +define_impl! { i64x2, i64, 2, i64x2, x0, x1 } define_ty! { u32x4, u32, u32, u32, u32 } -define_impl! { u32x4, u32, 4, x0, x1, x2, x3 } +define_impl! { u32x4, u32, 4, i32x4, x0, x1, x2, x3 } define_ty! { i32x4, i32, i32, i32, i32 } -define_impl! { i32x4, i32, 4, x0, x1, x2, x3 } +define_impl! { i32x4, i32, 4, i32x4, x0, x1, x2, x3 } define_ty! { u16x8, u16, u16, u16, u16, u16, u16, u16, u16 } -define_impl! { u16x8, u16, 8, x0, x1, x2, x3, x4, x5, x6, x7 } +define_impl! { u16x8, u16, 8, i16x8, x0, x1, x2, x3, x4, x5, x6, x7 } define_ty! { i16x8, i16, i16, i16, i16, i16, i16, i16, i16 } -define_impl! { i16x8, i16, 8, x0, x1, x2, x3, x4, x5, x6, x7 } +define_impl! { i16x8, i16, 8, i16x8, x0, x1, x2, x3, x4, x5, x6, x7 } define_ty! { u8x16, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 } define_impl! { - u8x16, u8, 16, + u8x16, u8, 16, i8x16, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 } @@ -36,7 +36,7 @@ define_ty! { i8x16, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } define_impl! { - i8x16, i8, 16, + i8x16, i8, 16, i8x16, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 } @@ -61,3 +61,22 @@ define_integer_ops!( (i16x8, i16), (u8x16, u8), (i8x16, i8)); +define_casts!( + (f64x2, f32x2, as_f32x2), + (f64x2, u64x2, as_u64x2), + (f64x2, i64x2, as_i64x2), + (f32x4, f64x4, as_f64x4), + (f32x4, u32x4, as_u32x4), + (f32x4, i32x4, as_i32x4), + (u64x2, f64x2, as_f64x2), + (u64x2, i64x2, as_i64x2), + (i64x2, f64x2, as_f64x2), + (i64x2, u64x2, as_u64x2), + (u32x4, f32x4, as_f32x4), + (u32x4, i32x4, as_i32x4), + (i32x4, f32x4, as_f32x4), + (i32x4, u32x4, as_u32x4), + (u16x8, i16x8, as_i16x8), + (i16x8, u16x8, as_u16x8), + (u8x16, i8x16, as_i8x16), + (i8x16, u8x16, as_u8x16)); diff --git a/library/stdarch/src/v256.rs b/library/stdarch/src/v256.rs new file mode 100644 index 000000000000..ba09e70ebdee --- /dev/null +++ b/library/stdarch/src/v256.rs @@ -0,0 +1,105 @@ +use simd::*; + +define_ty! { f64x4, f64, f64, f64, f64 } +define_impl! { f64x4, f64, 4, i64x4, x0, x1, x2, x3 } + +define_ty! { f32x8, f32, f32, f32, f32, f32, f32, f32, f32 } +define_impl! { f32x8, f32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 } + +define_ty! { u64x4, u64, u64, u64, u64 } +define_impl! { u64x4, u64, 4, i64x4, x0, x1, x2, x3 } + +define_ty! { i64x4, i64, i64, i64, i64 } +define_impl! { i64x4, i64, 4, i64x4, x0, x1, x2, x3 } + +define_ty! { u32x8, u32, u32, u32, u32, u32, u32, u32, u32 } +define_impl! { u32x8, u32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 } + +define_ty! { i32x8, i32, i32, i32, i32, i32, i32, i32, i32 } +define_impl! { i32x8, i32, 8, i32x8, x0, x1, x2, x3, x4, x5, x6, x7 } + +define_ty! { + u16x16, + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16 +} +define_impl! { + u16x16, u16, 16, i16x16, + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15 +} + +define_ty! { + i16x16, + i16, i16, i16, i16, i16, i16, i16, i16, + i16, i16, i16, i16, i16, i16, i16, i16 +} +define_impl! { + i16x16, i16, 16, i16x16, + x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15 +} + +define_ty! { + u8x32, + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 +} +define_impl! { + u8x32, u8, 32, i8x32, + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 +} + +define_ty! { + i8x32, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 +} +define_impl! { + i8x32, i8, 32, i8x32, + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 +} + +define_from!(u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32); +define_from!(i64x4, u64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32); +define_from!(u32x8, u64x4, i64x4, i32x8, u16x16, i16x16, u8x32, i8x32); +define_from!(i32x8, u64x4, i64x4, u32x8, u16x16, i16x16, u8x32, i8x32); +define_from!(u16x16, u64x4, i64x4, u32x8, i32x8, i16x16, u8x32, i8x32); +define_from!(i16x16, u64x4, i64x4, u32x8, i32x8, u16x16, u8x32, i8x32); +define_from!(u8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, i8x32); +define_from!(i8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32); + +define_common_ops!( + f64x4, f32x8, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32); +define_float_ops!(f64x4, f32x8); +define_integer_ops!( + (u64x4, u64), + (i64x4, i64), + (u32x8, u32), + (i32x8, i32), + (u16x16, u16), + (i16x16, i16), + (u8x32, u8), + (i8x32, i8)); +define_casts!( + (f64x4, f32x4, as_f32x4), + (f64x4, u64x4, as_u64x4), + (f64x4, i64x4, as_i64x4), + (f32x8, u32x8, as_u32x8), + (f32x8, i32x8, as_i32x8), + (u64x4, f64x4, as_f64x4), + (u64x4, i64x4, as_i64x4), + (i64x4, f64x4, as_f64x4), + (i64x4, u64x4, as_u64x4), + (u32x8, f32x8, as_f32x8), + (u32x8, i32x8, as_i32x8), + (i32x8, f32x8, as_f32x8), + (i32x8, u32x8, as_u32x8), + (u16x16, i16x16, as_i16x16), + (i16x16, u16x16, as_u16x16), + (u8x32, i8x32, as_i8x32), + (i8x32, u8x32, as_u8x32)); diff --git a/library/stdarch/src/v64.rs b/library/stdarch/src/v64.rs index 2e197665d931..de2c14c356a8 100644 --- a/library/stdarch/src/v64.rs +++ b/library/stdarch/src/v64.rs @@ -1,25 +1,25 @@ use simd::*; define_ty! { f32x2, f32, f32 } -define_impl! { f32x2, f32, 2, x0, x1 } +define_impl! { f32x2, f32, 2, i32x2, x0, x1 } define_ty! { u32x2, u32, u32 } -define_impl! { u32x2, u32, 2, x0, x1 } +define_impl! { u32x2, u32, 2, i32x2, x0, x1 } define_ty! { i32x2, i32, i32 } -define_impl! { i32x2, i32, 2, x0, x1 } +define_impl! { i32x2, i32, 2, i32x2, x0, x1 } define_ty! { u16x4, u16, u16, u16, u16 } -define_impl! { u16x4, u16, 4, x0, x1, x2, x3 } +define_impl! { u16x4, u16, 4, i16x4, x0, x1, x2, x3 } define_ty! { i16x4, i16, i16, i16, i16 } -define_impl! { i16x4, i16, 4, x0, x1, x2, x3 } +define_impl! { i16x4, i16, 4, i16x4, x0, x1, x2, x3 } define_ty! { u8x8, u8, u8, u8, u8, u8, u8, u8, u8 } -define_impl! { u8x8, u8, 8, x0, x1, x2, x3, x4, x5, x6, x7 } +define_impl! { u8x8, u8, 8, i8x8, x0, x1, x2, x3, x4, x5, x6, x7 } define_ty! { i8x8, i8, i8, i8, i8, i8, i8, i8, i8 } -define_impl! { i8x8, i8, 8, x0, x1, x2, x3, x4, x5, x6, x7 } +define_impl! { i8x8, i8, 8, i8x8, x0, x1, x2, x3, x4, x5, x6, x7 } define_from!(u32x2, i32x2, u16x4, i16x4, u8x8, i8x8); define_from!(i32x2, u32x2, u16x4, i16x4, u8x8, i8x8); @@ -37,3 +37,15 @@ define_integer_ops!( (i16x4, i16), (u8x8, u8), (i8x8, i8)); +define_casts!( + (f32x2, f64x2, as_f64x2), + (f32x2, u32x2, as_u32x2), + (f32x2, i32x2, as_i32x2), + (u32x2, f32x2, as_f32x2), + (u32x2, i32x2, as_i32x2), + (i32x2, f32x2, as_f32x2), + (i32x2, u32x2, as_u32x2), + (u16x4, i16x4, as_i16x4), + (i16x4, u16x4, as_u16x4), + (u8x8, i8x8, as_i8x8), + (i8x8, u8x8, as_u8x8)); diff --git a/library/stdarch/src/x86/mod.rs b/library/stdarch/src/x86/mod.rs index c968cc76a538..610bf657d0be 100644 --- a/library/stdarch/src/x86/mod.rs +++ b/library/stdarch/src/x86/mod.rs @@ -1,4 +1,4 @@ -// pub use self::sse::*; +pub use self::sse::*; pub use self::sse2::*; pub use self::ssse3::*; pub use self::sse42::*; @@ -6,7 +6,7 @@ pub use self::sse42::*; #[allow(non_camel_case_types)] pub type __m128i = ::v128::i8x16; -// mod sse; +mod sse; mod sse2; mod ssse3; mod sse42; diff --git a/library/stdarch/src/x86/sse.rs b/library/stdarch/src/x86/sse.rs index e69de29bb2d1..c03735624c75 100644 --- a/library/stdarch/src/x86/sse.rs +++ b/library/stdarch/src/x86/sse.rs @@ -0,0 +1,128 @@ +use v128::*; + +/// Return the square root of packed single-precision (32-bit) floating-point +/// elements in `a`. +#[inline(always)] +#[target_feature = "+sse"] +pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 { + unsafe { sqrtps(a) } +} + +/// Return the approximate reciprocal of packed single-precision (32-bit) +/// floating-point elements in `a`. +#[inline(always)] +#[target_feature = "+sse"] +pub fn _mm_rcp_ps(a: f32x4) -> f32x4 { + unsafe { rcpps(a) } +} + +/// Return the approximate reciprocal square root of packed single-precision +/// (32-bit) floating-point elements in `a`. +#[inline(always)] +#[target_feature = "+sse"] +pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 { + unsafe { rsqrtps(a) } +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and +/// `b`, and return the corresponding minimum values. +#[inline(always)] +#[target_feature = "+sse"] +pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 { + unsafe { minps(a, b) } +} + +/// Compare packed single-precision (32-bit) floating-point elements in `a` and +/// `b`, and return the corresponding maximum values. +#[inline(always)] +#[target_feature = "+sse"] +pub fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 { + unsafe { maxps(a, b) } +} + +/// Return a mask of the most significant bit of each element in `a`. +/// +/// The mask is stored in the 4 least significant bits of the return value. +/// All other bits are set to `0`. +#[inline(always)] +#[target_feature = "+sse"] +pub fn _mm_movemask_ps(a: f32x4) -> i32 { + unsafe { movmskps(a) } +} + +#[allow(improper_ctypes)] +extern { + #[link_name = "llvm.x86.sse.sqrt.ps"] + fn sqrtps(a: f32x4) -> f32x4; + #[link_name = "llvm.x86.sse.rcp.ps"] + fn rcpps(a: f32x4) -> f32x4; + #[link_name = "llvm.x86.sse.rsqrt.ps"] + fn rsqrtps(a: f32x4) -> f32x4; + #[link_name = "llvm.x86.sse.min.ps"] + fn minps(a: f32x4, b: f32x4) -> f32x4; + #[link_name = "llvm.x86.sse.max.ps"] + fn maxps(a: f32x4, b: f32x4) -> f32x4; + #[link_name = "llvm.x86.sse.movmsk.ps"] + fn movmskps(a: f32x4) -> i32; +} + +#[cfg(test)] +mod tests { + use v128::*; + use x86::sse; + + #[test] + #[target_feature = "+sse"] + fn _mm_sqrt_ps() { + let a = f32x4::new(4.0, 13.0, 16.0, 100.0); + let r = sse::_mm_sqrt_ps(a); + let e = f32x4::new(2.0, 3.6055512, 4.0, 10.0); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+sse"] + fn _mm_rcp_ps() { + let a = f32x4::new(4.0, 13.0, 16.0, 100.0); + let r = sse::_mm_rcp_ps(a); + let e = f32x4::new(0.24993896, 0.0769043, 0.06248474, 0.0099983215); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+sse"] + fn _mm_rsqrt_ps() { + let a = f32x4::new(4.0, 13.0, 16.0, 100.0); + let r = sse::_mm_rsqrt_ps(a); + let e = f32x4::new(0.49987793, 0.2772827, 0.24993896, 0.099990845); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+sse"] + fn _mm_min_ps() { + let a = f32x4::new(-1.0, 5.0, 0.0, -10.0); + let b = f32x4::new(-100.0, 20.0, 0.0, -5.0); + let r = sse::_mm_min_ps(a, b); + assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0)); + } + + #[test] + #[target_feature = "+sse"] + fn _mm_max_ps() { + let a = f32x4::new(-1.0, 5.0, 0.0, -10.0); + let b = f32x4::new(-100.0, 20.0, 0.0, -5.0); + let r = sse::_mm_max_ps(a, b); + assert_eq!(r, f32x4::new(-1.0, 20.0, 0.0, -5.0)); + } + + #[test] + #[target_feature = "+sse"] + fn _mm_movemask_ps() { + let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0)); + assert_eq!(r, 0b0101); + + let r = sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0)); + assert_eq!(r, 0b0111); + } +} diff --git a/library/stdarch/src/x86/sse2.rs b/library/stdarch/src/x86/sse2.rs index bb7e414a6bb9..1ecaf9ea43aa 100644 --- a/library/stdarch/src/x86/sse2.rs +++ b/library/stdarch/src/x86/sse2.rs @@ -2,7 +2,9 @@ use std::mem; use std::os::raw::c_void; use std::ptr; -use simd::*; +use simd::{ + simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16, +}; use x86::__m128i; use v128::*; use v64::*; @@ -519,63 +521,63 @@ pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cmpeq_epi8(a: i8x16, b: i8x16) -> i8x16 { - unsafe { simd_eq(a, b) } + a.eq(b) } /// Compare packed 16-bit integers in `a` and `b` for equality. #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cmpeq_epi16(a: i16x8, b: i16x8) -> i16x8 { - unsafe { simd_eq(a, b) } + a.eq(b) } /// Compare packed 32-bit integers in `a` and `b` for equality. #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cmpeq_epi32(a: i32x4, b: i32x4) -> i32x4 { - unsafe { simd_eq(a, b) } + a.eq(b) } /// Compare packed 8-bit integers in `a` and `b` for greater-than. #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cmpgt_epi8(a: i8x16, b: i8x16) -> i8x16 { - unsafe { simd_gt(a, b) } + a.gt(b) } /// Compare packed 16-bit integers in `a` and `b` for greater-than. #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cmpgt_epi16(a: i16x8, b: i16x8) -> i16x8 { - unsafe { simd_gt(a, b) } + a.gt(b) } /// Compare packed 32-bit integers in `a` and `b` for greater-than. #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cmpgt_epi32(a: i32x4, b: i32x4) -> i32x4 { - unsafe { simd_gt(a, b) } + a.gt(b) } /// Compare packed 8-bit integers in `a` and `b` for less-than. #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cmplt_epi8(a: i8x16, b: i8x16) -> i8x16 { - unsafe { simd_lt(a, b) } + a.lt(b) } /// Compare packed 16-bit integers in `a` and `b` for less-than. #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cmplt_epi16(a: i16x8, b: i16x8) -> i16x8 { - unsafe { simd_lt(a, b) } + a.lt(b) } /// Compare packed 32-bit integers in `a` and `b` for less-than. #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 { - unsafe { simd_lt(a, b) } + a.lt(b) } /// Convert the lower two packed 32-bit integers in `a` to packed @@ -591,7 +593,7 @@ pub fn _mm_cvtepi32_pd(a: i32x4) -> f64x2 { #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cvtsi32_sd(a: f64x2, b: i32) -> f64x2 { - a.insert(0, b as f64) + a.replace(0, b as f64) } /// Return `a` with its lower element replaced by `b` after converting it to @@ -599,7 +601,7 @@ pub fn _mm_cvtsi32_sd(a: f64x2, b: i32) -> f64x2 { #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cvtsi64_sd(a: f64x2, b: i64) -> f64x2 { - a.insert(0, b as f64) + a.replace(0, b as f64) } /// Return `a` with its lower element replaced by `b` after converting it to @@ -842,7 +844,7 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_move_epi64(a: i64x2) -> i64x2 { - a.insert(1, 0) + a.replace(1, 0) } /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers @@ -880,7 +882,7 @@ pub fn _mm_extract_epi16(a: i16x8, imm8: i32) -> i32 { #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_insert_epi16(a: i16x8, i: i32, imm8: i32) -> i16x8 { - a.insert(imm8 as u32 & 0b111, i as i16) + a.replace(imm8 as u32 & 0b111, i as i16) } /// Return a mask of the most significant bit of each element in `a`. @@ -1134,7 +1136,7 @@ pub fn _mm_unpacklo_epi64(a: i64x2, b: i64x2) -> i64x2 { #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_add_sd(a: f64x2, b: f64x2) -> f64x2 { - a.insert(0, a.extract(0) + b.extract(0)) + a.replace(0, a.extract(0) + b.extract(0)) } /// Add packed double-precision (64-bit) floating-point elements in `a` and @@ -1150,7 +1152,7 @@ pub fn _mm_add_pd(a: f64x2, b: f64x2) -> f64x2 { #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_div_sd(a: f64x2, b: f64x2) -> f64x2 { - a.insert(0, a.extract(0) / b.extract(0)) + a.replace(0, a.extract(0) / b.extract(0)) } /// Divide packed double-precision (64-bit) floating-point elements in `a` by @@ -1198,7 +1200,7 @@ pub fn _mm_min_pd(a: f64x2, b: f64x2) -> f64x2 { #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_mul_sd(a: f64x2, b: f64x2) -> f64x2 { - a.insert(0, a.extract(0) * b.extract(0)) + a.replace(0, a.extract(0) * b.extract(0)) } /// Multiply packed double-precision (64-bit) floating-point elements in `a` @@ -1214,7 +1216,7 @@ pub fn _mm_mul_pd(a: f64x2, b: f64x2) -> f64x2 { #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_sqrt_sd(a: f64x2, b: f64x2) -> f64x2 { - a.insert(0, unsafe { sqrtsd(b).extract(0) }) + a.replace(0, unsafe { sqrtsd(b).extract(0) }) } /// Return a new vector with the square root of each of the values in `a`. @@ -1229,7 +1231,7 @@ pub fn _mm_sqrt_pd(a: f64x2) -> f64x2 { #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_sub_sd(a: f64x2, b: f64x2) -> f64x2 { - a.insert(0, a.extract(0) - b.extract(0)) + a.replace(0, a.extract(0) - b.extract(0)) } /// Subtract packed double-precision (64-bit) floating-point elements in `b` @@ -1314,7 +1316,7 @@ pub fn _mm_cmple_sd(a: f64x2, b: f64x2) -> f64x2 { #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 { - _mm_cmplt_sd(b, a).insert(1, a.extract(1)) + _mm_cmplt_sd(b, a).replace(1, a.extract(1)) } /// Return a new vector with the low element of `a` replaced by the @@ -1322,7 +1324,7 @@ pub fn _mm_cmpgt_sd(a: f64x2, b: f64x2) -> f64x2 { #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cmpge_sd(a: f64x2, b: f64x2) -> f64x2 { - _mm_cmple_sd(b, a).insert(1, a.extract(1)) + _mm_cmple_sd(b, a).replace(1, a.extract(1)) } /// Return a new vector with the low element of `a` replaced by the result @@ -1373,7 +1375,7 @@ pub fn _mm_cmpnle_sd(a: f64x2, b: f64x2) -> f64x2 { #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 { - _mm_cmpnlt_sd(b, a).insert(1, a.extract(1)) + _mm_cmpnlt_sd(b, a).replace(1, a.extract(1)) } /// Return a new vector with the low element of `a` replaced by the @@ -1381,7 +1383,7 @@ pub fn _mm_cmpngt_sd(a: f64x2, b: f64x2) -> f64x2 { #[inline(always)] #[target_feature = "+sse2"] pub fn _mm_cmpnge_sd(a: f64x2, b: f64x2) -> f64x2 { - _mm_cmpnle_sd(b, a).insert(1, a.extract(1)) + _mm_cmpnle_sd(b, a).replace(1, a.extract(1)) } /// Compare corresponding elements in `a` and `b` for equality. @@ -1553,8 +1555,15 @@ pub fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool { unsafe { mem::transmute(ucomineqsd(a, b) as u8) } } - - +/// Return a mask of the most significant bit of each element in `a`. +/// +/// The mask is stored in the 2 least significant bits of the return value. +/// All other bits are set to `0`. +#[inline(always)] +#[target_feature = "+sse2"] +pub fn _mm_movemask_pd(a: f64x2) -> i32 { + unsafe { movmskpd(a) } +} @@ -1703,6 +1712,8 @@ extern { fn ucomigesd(a: f64x2, b: f64x2) -> i32; #[link_name = "llvm.x86.sse2.ucomineq.sd"] fn ucomineqsd(a: f64x2, b: f64x2) -> i32; + #[link_name = "llvm.x86.sse2.movmsk.pd"] + fn movmskpd(a: f64x2) -> i32; } #[cfg(test)] @@ -2306,7 +2317,7 @@ mod tests { let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); let b = i16x8::new(7, 6, 2, 4, 3, 2, 1, 0); let r = sse2::_mm_cmpeq_epi16(a, b); - assert_eq!(r, i16x8::splat(0).insert(2, 0xFFFFu16 as i16)); + assert_eq!(r, i16x8::splat(0).replace(2, 0xFFFFu16 as i16)); } #[test] @@ -2314,55 +2325,55 @@ mod tests { let a = i32x4::new(0, 1, 2, 3); let b = i32x4::new(3, 2, 2, 0); let r = sse2::_mm_cmpeq_epi32(a, b); - assert_eq!(r, i32x4::splat(0).insert(2, 0xFFFFFFFFu32 as i32)); + assert_eq!(r, i32x4::splat(0).replace(2, 0xFFFFFFFFu32 as i32)); } #[test] fn _mm_cmpgt_epi8() { - let a = i8x16::splat(0).insert(0, 5); + let a = i8x16::splat(0).replace(0, 5); let b = i8x16::splat(0); let r = sse2::_mm_cmpgt_epi8(a, b); - assert_eq!(r, i8x16::splat(0).insert(0, 0xFFu8 as i8)); + assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8)); } #[test] fn _mm_cmpgt_epi16() { - let a = i16x8::splat(0).insert(0, 5); + let a = i16x8::splat(0).replace(0, 5); let b = i16x8::splat(0); let r = sse2::_mm_cmpgt_epi16(a, b); - assert_eq!(r, i16x8::splat(0).insert(0, 0xFFFFu16 as i16)); + assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16)); } #[test] fn _mm_cmpgt_epi32() { - let a = i32x4::splat(0).insert(0, 5); + let a = i32x4::splat(0).replace(0, 5); let b = i32x4::splat(0); let r = sse2::_mm_cmpgt_epi32(a, b); - assert_eq!(r, i32x4::splat(0).insert(0, 0xFFFFFFFFu32 as i32)); + assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32)); } #[test] fn _mm_cmplt_epi8() { let a = i8x16::splat(0); - let b = i8x16::splat(0).insert(0, 5); + let b = i8x16::splat(0).replace(0, 5); let r = sse2::_mm_cmplt_epi8(a, b); - assert_eq!(r, i8x16::splat(0).insert(0, 0xFFu8 as i8)); + assert_eq!(r, i8x16::splat(0).replace(0, 0xFFu8 as i8)); } #[test] fn _mm_cmplt_epi16() { let a = i16x8::splat(0); - let b = i16x8::splat(0).insert(0, 5); + let b = i16x8::splat(0).replace(0, 5); let r = sse2::_mm_cmplt_epi16(a, b); - assert_eq!(r, i16x8::splat(0).insert(0, 0xFFFFu16 as i16)); + assert_eq!(r, i16x8::splat(0).replace(0, 0xFFFFu16 as i16)); } #[test] fn _mm_cmplt_epi32() { let a = i32x4::splat(0); - let b = i32x4::splat(0).insert(0, 5); + let b = i32x4::splat(0).replace(0, 5); let r = sse2::_mm_cmplt_epi32(a, b); - assert_eq!(r, i32x4::splat(0).insert(0, 0xFFFFFFFFu32 as i32)); + assert_eq!(r, i32x4::splat(0).replace(0, 0xFFFFFFFFu32 as i32)); } #[test] @@ -2504,12 +2515,12 @@ mod tests { #[test] fn _mm_maskmoveu_si128() { let a = i8x16::splat(9); - let mask = i8x16::splat(0).insert(2, 0x80u8 as i8); + let mask = i8x16::splat(0).replace(2, 0x80u8 as i8); let mut r = i8x16::splat(0); unsafe { sse2::_mm_maskmoveu_si128(a, mask, &mut r as *mut _ as *mut i8); } - assert_eq!(r, i8x16::splat(0).insert(2, 9)); + assert_eq!(r, i8x16::splat(0).replace(2, 9)); } #[test] @@ -2586,7 +2597,7 @@ mod tests { #[test] fn _mm_insert_epi16() { let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq!(sse2::_mm_insert_epi16(a, 9, 0), a.insert(0, 9)); + assert_eq!(sse2::_mm_insert_epi16(a, 9, 0), a.replace(0, 9)); } #[test] @@ -3207,4 +3218,13 @@ mod tests { let (a, b) = (f64x2::new(1.0, 2.0), f64x2::new(1.0, 3.0)); assert!(!sse2::_mm_ucomineq_sd(a, b)); } + + #[test] + fn _mm_movemask_pd() { + let r = sse2::_mm_movemask_pd(f64x2::new(-1.0, 5.0)); + assert_eq!(r, 0b01); + + let r = sse2::_mm_movemask_pd(f64x2::new(-1.0, -5.0)); + assert_eq!(r, 0b11); + } }