diff --git a/library/stdarch/src/x86/avx.rs b/library/stdarch/src/x86/avx.rs index 91cf7cff08bc..60a4aeea2e1b 100644 --- a/library/stdarch/src/x86/avx.rs +++ b/library/stdarch/src/x86/avx.rs @@ -3,6 +3,8 @@ use std::mem; #[cfg(test)] use stdsimd_test::assert_instr; +use simd_llvm::{simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8}; +use v128::{f32x4, f64x2, i32x4, i64x2}; use v256::*; /// Add packed double-precision (64-bit) floating-point elements @@ -68,6 +70,71 @@ pub unsafe fn _mm256_or_ps(a: f32x8, b: f32x8) -> f32x8 { mem::transmute(a | b) } +/// Shuffle double-precision (64-bit) floating-point elements within 128-bit +/// lanes using the control in `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +//#[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x0))] // FIXME +pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle4 { + ($a:expr, $b:expr, $c:expr, $d:expr) => { + simd_shuffle4(a, b, [$a, $b, $c, $d]); + } + } + macro_rules! shuffle3 { + ($a:expr, $b: expr, $c: expr) => { + match (imm8 >> 3) & 0x1 { + 0 => shuffle4!($a, $b, $c, 6), + _ => shuffle4!($a, $b, $c, 7), + } + } + } + macro_rules! shuffle2 { + ($a:expr, $b:expr) => { + match (imm8 >> 2) & 0x1 { + 0 => shuffle3!($a, $b, 2), + _ => shuffle3!($a, $b, 3), + } + } + } + macro_rules! shuffle1 { + ($a:expr) => { + match (imm8 >> 1) & 0x1 { + 0 => shuffle2!($a, 4), + _ => shuffle2!($a, 5), + } + } + } + match (imm8 >> 0) & 0x1 { + 0 => shuffle1!(0), + _ => shuffle1!(1), + } +} + +/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a` +/// and then AND with `b`. +#[inline(always)] +#[target_feature = "+avx"] +// Should be 'vandnpd' instruction. +#[cfg_attr(test, assert_instr(vandnps))] +pub unsafe fn _mm256_andnot_pd(a: f64x4, b: f64x4) -> f64x4 { + let a: u64x4 = mem::transmute(a); + let b: u64x4 = mem::transmute(b); + mem::transmute((!a) & b) +} + +/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` +/// and then AND with `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vandnps))] +pub unsafe fn _mm256_andnot_ps(a: f32x8, b: f32x8) -> f32x8 { + let a: u32x8 = mem::transmute(a); + let b: u32x8 = mem::transmute(b); + mem::transmute((!a) & b) +} + /// Compare packed double-precision (64-bit) floating-point elements /// in `a` and `b`, and return packed maximum values #[inline(always)] @@ -274,6 +341,393 @@ pub unsafe fn _mm256_sqrt_pd(a: f64x4) -> f64x4 { sqrtpd256(a) } +/// Blend packed double-precision (64-bit) floating-point elements from +/// `a` and `b` using control mask `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))] +pub unsafe fn _mm256_blend_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! blend4 { + ($a:expr, $b:expr, $c:expr, $d:expr) => { + simd_shuffle4(a, b, [$a, $b, $c, $d]); + } + } + macro_rules! blend3 { + ($a:expr, $b: expr, $c: expr) => { + match imm8 & 0x8 { + 0 => blend4!($a, $b, $c, 3), + _ => blend4!($a, $b, $c, 7), + } + } + } + macro_rules! blend2 { + ($a:expr, $b:expr) => { + match imm8 & 0x4 { + 0 => blend3!($a, $b, 2), + _ => blend3!($a, $b, 6), + } + } + } + macro_rules! blend1 { + ($a:expr) => { + match imm8 & 0x2 { + 0 => blend2!($a, 1), + _ => blend2!($a, 5), + } + } + } + match imm8 & 0x1 { + 0 => blend1!(0), + _ => blend1!(4), + } +} + +/// Blend packed double-precision (64-bit) floating-point elements from +/// `a` and `b` using `c` as a mask. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vblendvpd))] +pub unsafe fn _mm256_blendv_pd(a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + vblendvpd(a, b, c) +} + +/// Blend packed single-precision (32-bit) floating-point elements from +/// `a` and `b` using `c` as a mask. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vblendvps))] +pub unsafe fn _mm256_blendv_ps(a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + vblendvps(a, b, c) +} + +/// Conditionally multiply the packed single-precision (32-bit) floating-point +/// elements in `a` and `b` using the high 4 bits in `imm8`, +/// sum the four products, and conditionally return the sum +/// using the low 4 bits of `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vdpps, imm8 = 0x0))] +pub unsafe fn _mm256_dp_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 { + macro_rules! call { + ($imm8:expr) => { vdpps(a, b, $imm8) } + } + constify_imm8!(imm8, call) +} + +/// Horizontal addition of adjacent pairs in the two packed vectors +/// of 4 64-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in even locations, +/// while sums of elements from `b` are returned in odd locations. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vhaddpd))] +pub unsafe fn _mm256_hadd_pd(a: f64x4, b: f64x4) -> f64x4 { + vhaddpd(a, b) +} + +/// Horizontal addition of adjacent pairs in the two packed vectors +/// of 8 32-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in locations of +/// indices 0, 1, 4, 5; while sums of elements from `b` are locations +/// 2, 3, 6, 7. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vhaddps))] +pub unsafe fn _mm256_hadd_ps(a: f32x8, b: f32x8) -> f32x8 { + vhaddps(a, b) +} + +/// Horizontal subtraction of adjacent pairs in the two packed vectors +/// of 4 64-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in even locations, +/// while sums of elements from `b` are returned in odd locations. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vhsubpd))] +pub unsafe fn _mm256_hsub_pd(a: f64x4, b: f64x4) -> f64x4 { + vhsubpd(a, b) +} + +/// Horizontal subtraction of adjacent pairs in the two packed vectors +/// of 8 32-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in locations of +/// indices 0, 1, 4, 5; while sums of elements from `b` are locations +/// 2, 3, 6, 7. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vhsubps))] +pub unsafe fn _mm256_hsub_ps(a: f32x8, b: f32x8) -> f32x8 { + vhsubps(a, b) +} + +/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +// FIXME Should be 'vxorpd' instruction. +#[cfg_attr(test, assert_instr(vxorps))] +pub unsafe fn _mm256_xor_pd(a: f64x4, b: f64x4) -> f64x4 { + let a: u64x4 = mem::transmute(a); + let b: u64x4 = mem::transmute(b); + mem::transmute(a ^ b) +} + +/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vxorps))] +pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 { + let a: u32x8 = mem::transmute(a); + let b: u32x8 = mem::transmute(b); + mem::transmute(a ^ b) +} + +/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) +/// floating-point elements. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcvtdq2pd))] +pub unsafe fn _mm256_cvtepi32_pd(a: i32x4) -> f64x4 { + simd_cast(a) +} + +/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) +/// floating-point elements. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcvtdq2ps))] +pub unsafe fn _mm256_cvtepi32_ps(a: i32x8) -> f32x8 { + vcvtdq2ps(a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` +/// to packed single-precision (32-bit) floating-point elements. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub unsafe fn _mm256_cvtpd_ps(a: f64x4) -> f32x4 { + vcvtpd2ps(a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub unsafe fn _mm256_cvtps_epi32(a: f32x8) -> i32x8 { + vcvtps2dq(a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in `a` +/// to packed double-precision (64-bit) floating-point elements. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcvtps2pd))] +pub unsafe fn _mm256_cvtps_pd(a: f32x4) -> f64x4 { + a.as_f64x4() +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` +/// to packed 32-bit integers with truncation. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub unsafe fn _mm256_cvttpd_epi32(a: f64x4) -> i32x4 { + vcvttpd2dq(a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` +/// to packed 32-bit integers. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub unsafe fn _mm256_cvtpd_epi32(a: f64x4) -> i32x4 { + vcvtpd2dq(a) +} + +/// Convert packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers with truncation. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub unsafe fn _mm256_cvttps_epi32(a: f32x8) -> i32x8 { + vcvttps2dq(a) +} + +/// Extract 128 bits (composed of 4 packed single-precision (32-bit) +/// floating-point elements) from `a`, selected with `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vextractf128))] +pub unsafe fn _mm256_extractf128_ps(a: f32x8, imm8: i32) -> f32x4 { + match imm8 & 1 { + 0 => simd_shuffle4(a, _mm256_undefined_ps(), [0, 1, 2, 3]), + _ => simd_shuffle4(a, _mm256_undefined_ps(), [4, 5, 6, 7]), + } +} + +/// Extract 128 bits (composed of 2 packed double-precision (64-bit) +/// floating-point elements) from `a`, selected with `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vextractf128))] +pub unsafe fn _mm256_extractf128_pd(a: f64x4, imm8: i32) -> f64x2 { + match imm8 & 1 { + 0 => simd_shuffle2(a, _mm256_undefined_pd(), [0, 1]), + _ => simd_shuffle2(a, _mm256_undefined_pd(), [2, 3]), + } +} + +/// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vextractf128))] +pub unsafe fn _mm256_extractf128_si256(a: i64x4, imm8: i32) -> i64x2 { + match imm8 & 1 { + 0 => simd_shuffle2(a, _mm256_undefined_si256(), [0, 1]), + _ => simd_shuffle2(a, _mm256_undefined_si256(), [2, 3]), + } +} + +/// Extract an 8-bit integer from `a`, selected with `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_extract_epi8(a: i8x32, imm8: i32) -> i32 { + a.extract(imm8 as u32 & 31) as i32 +} + +/// Extract a 16-bit integer from `a`, selected with `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_extract_epi16(a: i16x16, imm8: i32) -> i32 { + a.extract(imm8 as u32 & 15) as i32 +} + +/// Extract a 32-bit integer from `a`, selected with `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_extract_epi32(a: i32x8, imm8: i32) -> i32 { + a.extract(imm8 as u32 & 7) as i32 +} + +/// Extract a 64-bit integer from `a`, selected with `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_extract_epi64(a: i64x4, imm8: i32) -> i32 { + a.extract(imm8 as u32 & 3) as i32 +} + +/// Zero the contents of all XMM or YMM registers. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vzeroall))] +pub unsafe fn _mm256_zeroall() { + vzeroall() +} + +/// Zero the upper 128 bits of all YMM registers; +/// the lower 128-bits of the registers are unmodified. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vzeroupper))] +pub unsafe fn _mm256_zeroupper() { + vzeroupper() +} + +/// Shuffle single-precision (32-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpermilps))] +pub unsafe fn _mm256_permutevar_ps(a: f32x8, b: i32x8) -> f32x8 { + vpermilps256(a, b) +} + +/// Shuffle single-precision (32-bit) floating-point elements in `a` +/// using the control in `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpermilps))] +pub unsafe fn _mm_permutevar_ps(a: f32x4, b: i32x4) -> f32x4 { + vpermilps(a, b) +} + +/// Shuffle single-precision (32-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] +pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 { + let imm8 = (imm8 & 0xFF) as u8; + const fn add4(x: u32) -> u32 { x + 4 } + macro_rules! shuffle4 { + ($a:expr, $b:expr, $c:expr, $d:expr) => { + simd_shuffle8(a, _mm256_undefined_ps(), [ + $a, $b, $c, $d, add4($a), add4($b), add4($c), add4($d) + ]) + } + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr) => { + match (imm8 >> 6) & 0b11 { + 0b00 => shuffle4!($a, $b, $c, 0), + 0b01 => shuffle4!($a, $b, $c, 1), + 0b10 => shuffle4!($a, $b, $c, 2), + _ => shuffle4!($a, $b, $c, 3), + } + } + } + macro_rules! shuffle2 { + ($a:expr, $b:expr) => { + match (imm8 >> 4) & 0b11 { + 0b00 => shuffle3!($a, $b, 0), + 0b01 => shuffle3!($a, $b, 1), + 0b10 => shuffle3!($a, $b, 2), + _ => shuffle3!($a, $b, 3), + } + } + } + macro_rules! shuffle1 { + ($a:expr) => { + match (imm8 >> 2) & 0b11 { + 0b00 => shuffle2!($a, 0), + 0b01 => shuffle2!($a, 1), + 0b10 => shuffle2!($a, 2), + _ => shuffle2!($a, 3), + } + } + } + match (imm8 >> 0) & 0b11 { + 0b00 => shuffle1!(0), + 0b01 => shuffle1!(1), + 0b10 => shuffle1!(2), + _ => shuffle1!(3), + } +} + +/// Return vector of type `f32x8` with undefined elements. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_undefined_ps() -> f32x8 { + mem::uninitialized() +} + +/// Return vector of type `f64x4` with undefined elements. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_undefined_pd() -> f64x4 { + mem::uninitialized() +} + +/// Return vector of type `i64x4` with undefined elements. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_undefined_si256() -> i64x4 { + mem::uninitialized() +} + /// LLVM intrinsics used in the above functions #[allow(improper_ctypes)] extern "C" { @@ -297,12 +751,47 @@ extern "C" { fn sqrtpd256(a: f64x4) -> f64x4; #[link_name = "llvm.x86.avx.sqrt.ps.256"] fn sqrtps256(a: f32x8) -> f32x8; + #[link_name = "llvm.x86.avx.blendv.pd.256"] + fn vblendvpd(a: f64x4, b: f64x4, c: f64x4) -> f64x4; + #[link_name = "llvm.x86.avx.blendv.ps.256"] + fn vblendvps(a: f32x8, b: f32x8, c: f32x8) -> f32x8; + #[link_name = "llvm.x86.avx.dp.ps.256"] + fn vdpps(a: f32x8, b: f32x8, imm8: i32) -> f32x8; + #[link_name = "llvm.x86.avx.hadd.pd.256"] + fn vhaddpd(a: f64x4, b: f64x4) -> f64x4; + #[link_name = "llvm.x86.avx.hadd.ps.256"] + fn vhaddps(a: f32x8, b: f32x8) -> f32x8; + #[link_name = "llvm.x86.avx.hsub.pd.256"] + fn vhsubpd(a: f64x4, b: f64x4) -> f64x4; + #[link_name = "llvm.x86.avx.hsub.ps.256"] + fn vhsubps(a: f32x8, b: f32x8) -> f32x8; + #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] + fn vcvtdq2ps(a: i32x8) -> f32x8; + #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] + fn vcvtpd2ps(a: f64x4) -> f32x4; + #[link_name = "llvm.x86.avx.cvt.ps2dq.256"] + fn vcvtps2dq(a: f32x8) -> i32x8; + #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"] + fn vcvttpd2dq(a: f64x4) -> i32x4; + #[link_name = "llvm.x86.avx.cvt.pd2dq.256"] + fn vcvtpd2dq(a: f64x4) -> i32x4; + #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"] + fn vcvttps2dq(a: f32x8) -> i32x8; + #[link_name = "llvm.x86.avx.vzeroall"] + fn vzeroall(); + #[link_name = "llvm.x86.avx.vzeroupper"] + fn vzeroupper(); + #[link_name = "llvm.x86.avx.vpermilvar.ps.256"] + fn vpermilps256(a: f32x8, b: i32x8) -> f32x8; + #[link_name = "llvm.x86.avx.vpermilvar.ps"] + fn vpermilps(a: f32x4, b: i32x4) -> f32x4; } #[cfg(test)] mod tests { use stdsimd_test::simd_test; + use v128::{f32x4, f64x2, i32x4, i64x2}; use v256::*; use x86::avx; @@ -360,6 +849,31 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "avx"] + unsafe fn _mm256_shuffle_pd() { + let a = f64x4::new(1.0, 4.0, 5.0, 8.0); + let b = f64x4::new(2.0, 3.0, 6.0, 7.0); + let r = avx::_mm256_shuffle_pd(a, b, 0xF); + let e = f64x4::new(4.0, 3.0, 8.0, 7.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_andnot_pd() { + let a = f64x4::splat(0.0); + let b = f64x4::splat(0.6); + let r = avx::_mm256_andnot_pd(a, b); + assert_eq!(r, b); + } + + #[simd_test = "avx"] + unsafe fn _mm256_andnot_ps() { + let a = f32x8::splat(0.0); + let b = f32x8::splat(0.6); + let r = avx::_mm256_andnot_ps(a, b); + assert_eq!(r, b); + } + #[simd_test = "avx"] unsafe fn _mm256_max_pd() { let a = f64x4::new(1.0, 4.0, 5.0, 8.0); @@ -543,4 +1057,280 @@ mod tests { let e = f64x4::new(1.0, 3.0, 8.0, 5.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_blend_pd() { + let a = f64x4::new(4.0, 9.0, 16.0, 25.0); + let b = f64x4::new(4.0, 3.0, 2.0, 5.0); + let r = avx::_mm256_blend_pd(a, b, 0x0); + assert_eq!(r, f64x4::new(4.0, 9.0, 16.0, 25.0)); + let r = avx::_mm256_blend_pd(a, b, 0x3); + assert_eq!(r, f64x4::new(4.0, 3.0, 16.0, 25.0)); + let r = avx::_mm256_blend_pd(a, b, 0xF); + assert_eq!(r, f64x4::new(4.0, 3.0, 2.0, 5.0)); + } + + #[simd_test = "avx"] + unsafe fn _mm256_blendv_pd() { + let a = f64x4::new(4.0, 9.0, 16.0, 25.0); + let b = f64x4::new(4.0, 3.0, 2.0, 5.0); + let c = f64x4::new(0.0, 0.0, !0 as f64, !0 as f64); + let r = avx::_mm256_blendv_pd(a, b, c); + let e = f64x4::new(4.0, 9.0, 2.0, 5.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_blendv_ps() { + let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0); + let b = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0); + let c = f32x8::new(0.0, 0.0, 0.0, 0.0, !0 as f32, !0 as f32, !0 as f32, !0 as f32); + let r = avx::_mm256_blendv_ps(a, b, c); + let e = f32x8::new(4.0, 9.0, 16.0, 25.0, 8.0, 9.0, 64.0, 50.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_dp_ps() { + let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0); + let b = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0); + let r = avx::_mm256_dp_ps(a, b, 0xFF); + let e = f32x8::new(200.0, 200.0, 200.0, 200.0, 2387.0, 2387.0, 2387.0, 2387.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_hadd_pd() { + let a = f64x4::new(4.0, 9.0, 16.0, 25.0); + let b = f64x4::new(4.0, 3.0, 2.0, 5.0); + let r = avx::_mm256_hadd_pd(a, b); + let e = f64x4::new(13.0, 7.0, 41.0, 7.0); + assert_eq!(r, e); + + let a = f64x4::new(1.0, 2.0, 3.0, 4.0); + let b = f64x4::new(5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_hadd_pd(a, b); + let e = f64x4::new(3.0, 11.0, 7.0, 15.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_hadd_ps() { + let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0); + let b = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0); + let r = avx::_mm256_hadd_ps(a, b); + let e = f32x8::new(13.0, 41.0, 7.0, 7.0, 13.0, 41.0, 17.0, 114.0); + assert_eq!(r, e); + + let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0); + let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_hadd_ps(a, b); + let e = f32x8::new(3.0, 7.0, 11.0, 15.0, 3.0, 7.0, 11.0, 15.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_hsub_pd() { + let a = f64x4::new(4.0, 9.0, 16.0, 25.0); + let b = f64x4::new(4.0, 3.0, 2.0, 5.0); + let r = avx::_mm256_hsub_pd(a, b); + let e = f64x4::new(-5.0, 1.0, -9.0, -3.0); + assert_eq!(r, e); + + let a = f64x4::new(1.0, 2.0, 3.0, 4.0); + let b = f64x4::new(5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_hsub_pd(a, b); + let e = f64x4::new(-1., -1., -1., -1.); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_hsub_ps() { + let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0); + let b = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0); + let r = avx::_mm256_hsub_ps(a, b); + let e = f32x8::new(-5.0, -9.0, 1.0, -3.0, -5.0, -9.0, -1.0, 14.0); + assert_eq!(r, e); + + let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0); + let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_hsub_ps(a, b); + let e = f32x8::new(-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0); + assert_eq!(r, e); + } + + + #[simd_test = "avx"] + unsafe fn _mm256_xor_pd() { + let a = f64x4::new(4.0, 9.0, 16.0, 25.0); + let b = f64x4::splat(0.0); + let r = avx::_mm256_xor_pd(a, b); + assert_eq!(r, a); + } + + #[simd_test = "avx"] + unsafe fn _mm256_xor_ps() { + let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0); + let b = f32x8::splat(0.0); + let r = avx::_mm256_xor_ps(a, b); + assert_eq!(r, a); + } + + #[simd_test = "avx"] + unsafe fn _mm256_cvtepi32_pd() { + let a = i32x4::new(4, 9, 16, 25); + let r = avx::_mm256_cvtepi32_pd(a); + let e = f64x4::new(4.0, 9.0, 16.0, 25.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_cvtepi32_ps() { + let a = i32x8::new(4, 9, 16, 25, 4, 9, 16, 25); + let r = avx::_mm256_cvtepi32_ps(a); + let e = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_cvtpd_ps() { + let a = f64x4::new(4.0, 9.0, 16.0, 25.0); + let r = avx::_mm256_cvtpd_ps(a); + let e = f32x4::new(4.0, 9.0, 16.0, 25.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_cvtps_epi32() { + let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0); + let r = avx::_mm256_cvtps_epi32(a); + let e = i32x8::new(4, 9, 16, 25, 4, 9, 16, 25); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_cvtps_pd() { + let a = f32x4::new(4.0, 9.0, 16.0, 25.0); + let r = avx::_mm256_cvtps_pd(a); + let e = f64x4::new(4.0, 9.0, 16.0, 25.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_cvttpd_epi32() { + let a = f64x4::new(4.0, 9.0, 16.0, 25.0); + let r = avx::_mm256_cvttpd_epi32(a); + let e = i32x4::new(4, 9, 16, 25); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_cvtpd_epi32() { + let a = f64x4::new(4.0, 9.0, 16.0, 25.0); + let r = avx::_mm256_cvtpd_epi32(a); + let e = i32x4::new(4, 9, 16, 25); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_cvttps_epi32() { + let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0); + let r = avx::_mm256_cvttps_epi32(a); + let e = i32x8::new(4, 9, 16, 25, 4, 9, 16, 25); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_extractf128_ps() { + let a = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0); + let r = avx::_mm256_extractf128_ps(a, 0); + let e = f32x4::new(4.0, 3.0, 2.0, 5.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_extractf128_pd() { + let a = f64x4::new(4.0, 3.0, 2.0, 5.0); + let r = avx::_mm256_extractf128_pd(a, 0); + let e = f64x2::new(4.0, 3.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_extractf128_si256() { + let a = i64x4::new(4, 3, 2, 5); + let r = avx::_mm256_extractf128_si256(a, 0); + let e = i64x2::new(4, 3); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_extract_epi8() { + let a = i8x32::new( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32); + let r = avx::_mm256_extract_epi8(a, 0); + assert_eq!(r, 1); + } + + #[simd_test = "avx"] + unsafe fn _mm256_extract_epi16() { + let a = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + let r = avx::_mm256_extract_epi16(a, 0); + assert_eq!(r, 0); + } + + #[simd_test = "avx"] + unsafe fn _mm256_extract_epi32() { + let a = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r = avx::_mm256_extract_epi32(a, 0); + assert_eq!(r, 1); + } + + #[simd_test = "avx"] + unsafe fn _mm256_extract_epi64() { + let a = i64x4::new(0, 1, 2, 3); + let r = avx::_mm256_extract_epi64(a, 3); + assert_eq!(r, 3); + } + + #[simd_test = "avx"] + unsafe fn _mm256_zeroall() { + avx::_mm256_zeroall(); + } + + #[simd_test = "avx"] + unsafe fn _mm256_zeroupper() { + avx::_mm256_zeroupper(); + } + + #[simd_test = "avx"] + unsafe fn _mm256_permutevar_ps() { + let a = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0); + let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r = avx::_mm256_permutevar_ps(a, b); + let e = f32x8::new(3.0, 2.0, 5.0, 4.0, 9.0, 64.0, 50.0, 8.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm_permutevar_ps() { + let a = f32x4::new(4.0, 3.0, 2.0, 5.0); + let b = i32x4::new(1, 2, 3, 4); + let r = avx::_mm_permutevar_ps(a, b); + let e = f32x4::new(3.0, 2.0, 5.0, 4.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_permute_ps() { + let a = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0); + let r = avx::_mm256_permute_ps(a, 0x1b); + let e = f32x8::new(5.0, 2.0, 3.0, 4.0, 50.0, 64.0, 9.0, 8.0); + assert_eq!(r, e); + } } diff --git a/library/stdarch/stdsimd-test/src/lib.rs b/library/stdarch/stdsimd-test/src/lib.rs index 4f1049357f7d..2861e6b5687d 100644 --- a/library/stdarch/stdsimd-test/src/lib.rs +++ b/library/stdarch/stdsimd-test/src/lib.rs @@ -271,7 +271,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) { } } - let probably_only_one_instruction = function.instrs.len() < 20; + let probably_only_one_instruction = function.instrs.len() < 30; if found && probably_only_one_instruction { return