diff --git a/library/stdarch/src/x86/sse3.rs b/library/stdarch/src/x86/sse3.rs index 32231e557e97..af7353359f94 100644 --- a/library/stdarch/src/x86/sse3.rs +++ b/library/stdarch/src/x86/sse3.rs @@ -8,7 +8,7 @@ use stdsimd_test::assert_instr; /// floating-point elements in `a` to/from packed elements in `b`. #[inline(always)] #[target_feature = "+sse3"] -#[cfg_attr(test, assert_instr(addsub))] +#[cfg_attr(test, assert_instr(addsubps))] pub unsafe fn _mm_addsub_ps(a: f32x4, b: f32x4) -> f32x4 { addsubps(a, b) } @@ -17,7 +17,7 @@ pub unsafe fn _mm_addsub_ps(a: f32x4, b: f32x4) -> f32x4 { /// floating-point elements in `a` to/from packed elements in `b`. #[inline(always)] #[target_feature = "+sse3"] -#[cfg_attr(test, assert_instr(addsub))] +#[cfg_attr(test, assert_instr(addsubpd))] pub unsafe fn _mm_addsub_pd(a: f64x2, b: f64x2) -> f64x2 { addsubpd(a, b) } @@ -26,7 +26,7 @@ pub unsafe fn _mm_addsub_pd(a: f64x2, b: f64x2) -> f64x2 { /// floating-point elements in `a` and `b`, and pack the results. #[inline(always)] #[target_feature = "+sse3"] -#[cfg_attr(test, assert_instr(hadd))] +#[cfg_attr(test, assert_instr(haddpd))] pub unsafe fn _mm_hadd_pd(a: f64x2, b: f64x2) -> f64x2 { haddpd(a, b) } @@ -35,7 +35,7 @@ pub unsafe fn _mm_hadd_pd(a: f64x2, b: f64x2) -> f64x2 { /// floating-point elements in `a` and `b`, and pack the results. #[inline(always)] #[target_feature = "+sse3"] -#[cfg_attr(test, assert_instr(hadd))] +#[cfg_attr(test, assert_instr(haddps))] pub unsafe fn _mm_hadd_ps(a: f32x4, b: f32x4) -> f32x4 { haddps(a, b) } @@ -44,11 +44,20 @@ pub unsafe fn _mm_hadd_ps(a: f32x4, b: f32x4) -> f32x4 { /// floating-point elements in `a` and `b`, and pack the results. 
#[inline(always)] #[target_feature = "+sse3"] -#[cfg_attr(test, assert_instr(hsub))] +#[cfg_attr(test, assert_instr(hsubpd))] pub unsafe fn _mm_hsub_pd(a: f64x2, b: f64x2) -> f64x2 { hsubpd(a, b) } +/// Horizontally subtract adjacent pairs of single-precision (32-bit) +/// floating-point elements in `a` and `b`, and pack the results. +#[inline(always)] +#[target_feature = "+sse3"] +#[cfg_attr(test, assert_instr(hsubps))] +pub unsafe fn _mm_hsub_ps(a: f32x4, b: f32x4) -> f32x4 { + hsubps(a, b) +} + /// Load 128-bits of integer data from unaligned memory. /// This intrinsic may perform better than `_mm_loadu_si128` /// when the data crosses a cache line boundary. @@ -71,6 +80,8 @@ extern { fn haddps(a: f32x4, b: f32x4) -> f32x4; #[link_name = "llvm.x86.sse3.hsub.pd"] fn hsubpd(a: f64x2, b: f64x2) -> f64x2; + #[link_name = "llvm.x86.sse3.hsub.ps"] + fn hsubps(a: f32x4, b: f32x4) -> f32x4; #[link_name = "llvm.x86.sse3.ldu.dq"] fn lddqu(mem_addr: *const i8) -> __m128i; } @@ -123,6 +134,14 @@ mod tests { assert_eq!(r, f64x2::new(-6.0, -120.0)); } + #[simd_test = "sse3"] + unsafe fn _mm_hsub_ps() { + let a = f32x4::new(-1.0, 5.0, 0.0, -10.0); + let b = f32x4::new(-100.0, 20.0, 0.0, -5.0); + let r = sse3::_mm_hsub_ps(a, b); + assert_eq!(r, f32x4::new(-6.0, 10.0, -120.0, 5.0)); + } + #[simd_test = "sse3"] unsafe fn _mm_lddqu_si128() { let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);