diff --git a/library/stdarch/src/x86/sse3.rs b/library/stdarch/src/x86/sse3.rs
index 0354f83c1eb7..32231e557e97 100644
--- a/library/stdarch/src/x86/sse3.rs
+++ b/library/stdarch/src/x86/sse3.rs
@@ -40,6 +40,15 @@ pub unsafe fn _mm_hadd_ps(a: f32x4, b: f32x4) -> f32x4 {
     haddps(a, b)
 }
 
+/// Horizontally subtract adjacent pairs of double-precision (64-bit)
+/// floating-point elements in `a` and `b`, and pack the results.
+#[inline(always)]
+#[target_feature = "+sse3"]
+#[cfg_attr(test, assert_instr(hsub))]
+pub unsafe fn _mm_hsub_pd(a: f64x2, b: f64x2) -> f64x2 {
+    hsubpd(a, b)
+}
+
 /// Load 128-bits of integer data from unaligned memory.
 /// This intrinsic may perform better than `_mm_loadu_si128`
 /// when the data crosses a cache line boundary.
@@ -60,6 +69,8 @@ extern {
     fn haddpd(a: f64x2, b: f64x2) -> f64x2;
     #[link_name = "llvm.x86.sse3.hadd.ps"]
     fn haddps(a: f32x4, b: f32x4) -> f32x4;
+    #[link_name = "llvm.x86.sse3.hsub.pd"]
+    fn hsubpd(a: f64x2, b: f64x2) -> f64x2;
     #[link_name = "llvm.x86.sse3.ldu.dq"]
     fn lddqu(mem_addr: *const i8) -> __m128i;
 }
@@ -104,6 +115,14 @@ mod tests {
         assert_eq!(r, f32x4::new(4.0, -10.0, -80.0, -5.0));
     }
 
+    #[simd_test = "sse3"]
+    unsafe fn _mm_hsub_pd() {
+        let a = f64x2::new(-1.0, 5.0);
+        let b = f64x2::new(-100.0, 20.0);
+        let r = sse3::_mm_hsub_pd(a, b);
+        assert_eq!(r, f64x2::new(-6.0, -120.0));
+    }
+
     #[simd_test = "sse3"]
     unsafe fn _mm_lddqu_si128() {
         let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);