* avx: _mm256_andnot_pd, _mm256_andnot_ps

* avx: _mm256_blendv_pd

* avx: _mm256_blend_pd with no assert_instr

With assert_instr: too many instructions in the disassembly

* avx: _mm256_blendv_ps

* avx: _mm256_hadd_pd

* avx: _mm256_hadd_ps

* avx: _mm256_hsub_pd

* avx: _mm256_hsub_ps

* avx: _mm256_xor_pd

* avx: _mm256_xor_ps

* avx: _mm256_cvtepi32_pd

* avx: _mm256_cvtepi32_ps

* avx: _mm256_cvtpd_ps

* avx: _mm256_cvtps_epi32

* avx: _mm256_cvtps_pd

* avx: _mm256_cvttpd_epi32

* avx: _mm256_cvtpd_epi32

* avx: replace simd_cast by proper instrunction

* avx: _mm256_cvttps_epi32

* avx: _mm256_extractf128_ps, _mm256_undefined_ps

* avx: _mm256_extractf128_pd, _mm256_undefined_pd

* avx: _mm256_extractf128_si256, _mm256_undefined_si256

* avx: _mm256_extract_epi8

* avx: _mm256_extract_epi16

* avx: _mm256_extract_epi32

* avx: _mm256_extract_epi64

* avx: _mm256_zeroall

* avx: _mm256_zeroupper

* avx: _mm256_permutevar_ps

* avx: _mm_permutevar_ps

* avx: replace simd_cast by as_*

* avx: _mm256_permute_ps

* avx: _mm256_dp_ps

* avx: _mm256_shuffle_pd

* avx: _mm256_shuffle_pd, wrong instruction generated

* implement _mm256_hadd_ps and _mm256_hadd_pd

* avx: implement _mm256_hsub_pd and _mm256_hsub_ps

* assert_instr: raise the limit up to 30 instructions
This commit is contained in:
gwenn 2017-10-05 20:42:29 +02:00 committed by Alex Crichton
parent b421e9210c
commit ee0f165e8e
2 changed files with 791 additions and 1 deletions

View file

@ -3,6 +3,8 @@ use std::mem;
#[cfg(test)]
use stdsimd_test::assert_instr;
use simd_llvm::{simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8};
use v128::{f32x4, f64x2, i32x4, i64x2};
use v256::*;
/// Add packed double-precision (64-bit) floating-point elements
@ -68,6 +70,71 @@ pub unsafe fn _mm256_or_ps(a: f32x8, b: f32x8) -> f32x8 {
mem::transmute(a | b)
}
/// Shuffle double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8`.
#[inline(always)]
#[target_feature = "+avx"]
//#[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x0))] // FIXME
pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
let imm8 = (imm8 & 0xFF) as u8;
macro_rules! shuffle4 {
($a:expr, $b:expr, $c:expr, $d:expr) => {
simd_shuffle4(a, b, [$a, $b, $c, $d]);
}
}
macro_rules! shuffle3 {
($a:expr, $b: expr, $c: expr) => {
match (imm8 >> 3) & 0x1 {
0 => shuffle4!($a, $b, $c, 6),
_ => shuffle4!($a, $b, $c, 7),
}
}
}
macro_rules! shuffle2 {
($a:expr, $b:expr) => {
match (imm8 >> 2) & 0x1 {
0 => shuffle3!($a, $b, 2),
_ => shuffle3!($a, $b, 3),
}
}
}
macro_rules! shuffle1 {
($a:expr) => {
match (imm8 >> 1) & 0x1 {
0 => shuffle2!($a, 4),
_ => shuffle2!($a, 5),
}
}
}
match (imm8 >> 0) & 0x1 {
0 => shuffle1!(0),
_ => shuffle1!(1),
}
}
/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`
/// and then AND with `b`.
#[inline(always)]
#[target_feature = "+avx"]
// Should be 'vandnpd' instruction.
#[cfg_attr(test, assert_instr(vandnps))]
pub unsafe fn _mm256_andnot_pd(a: f64x4, b: f64x4) -> f64x4 {
let a: u64x4 = mem::transmute(a);
let b: u64x4 = mem::transmute(b);
mem::transmute((!a) & b)
}
/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a`
/// and then AND with `b`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vandnps))]
pub unsafe fn _mm256_andnot_ps(a: f32x8, b: f32x8) -> f32x8 {
let a: u32x8 = mem::transmute(a);
let b: u32x8 = mem::transmute(b);
mem::transmute((!a) & b)
}
/// Compare packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and return packed maximum values
#[inline(always)]
@ -274,6 +341,393 @@ pub unsafe fn _mm256_sqrt_pd(a: f64x4) -> f64x4 {
sqrtpd256(a)
}
/// Blend packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
pub unsafe fn _mm256_blend_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
let imm8 = (imm8 & 0xFF) as u8;
macro_rules! blend4 {
($a:expr, $b:expr, $c:expr, $d:expr) => {
simd_shuffle4(a, b, [$a, $b, $c, $d]);
}
}
macro_rules! blend3 {
($a:expr, $b: expr, $c: expr) => {
match imm8 & 0x8 {
0 => blend4!($a, $b, $c, 3),
_ => blend4!($a, $b, $c, 7),
}
}
}
macro_rules! blend2 {
($a:expr, $b:expr) => {
match imm8 & 0x4 {
0 => blend3!($a, $b, 2),
_ => blend3!($a, $b, 6),
}
}
}
macro_rules! blend1 {
($a:expr) => {
match imm8 & 0x2 {
0 => blend2!($a, 1),
_ => blend2!($a, 5),
}
}
}
match imm8 & 0x1 {
0 => blend1!(0),
_ => blend1!(4),
}
}
/// Blend packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vblendvpd))]
pub unsafe fn _mm256_blendv_pd(a: f64x4, b: f64x4, c: f64x4) -> f64x4 {
vblendvpd(a, b, c)
}
/// Blend packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vblendvps))]
pub unsafe fn _mm256_blendv_ps(a: f32x8, b: f32x8, c: f32x8) -> f32x8 {
vblendvps(a, b, c)
}
/// Conditionally multiply the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`,
/// sum the four products, and conditionally return the sum
/// using the low 4 bits of `imm8`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vdpps, imm8 = 0x0))]
pub unsafe fn _mm256_dp_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 {
macro_rules! call {
($imm8:expr) => { vdpps(a, b, $imm8) }
}
constify_imm8!(imm8, call)
}
/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vhaddpd))]
pub unsafe fn _mm256_hadd_pd(a: f64x4, b: f64x4) -> f64x4 {
vhaddpd(a, b)
}
/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations of
/// indices 0, 1, 4, 5; while sums of elements from `b` are locations
/// 2, 3, 6, 7.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vhaddps))]
pub unsafe fn _mm256_hadd_ps(a: f32x8, b: f32x8) -> f32x8 {
vhaddps(a, b)
}
/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vhsubpd))]
pub unsafe fn _mm256_hsub_pd(a: f64x4, b: f64x4) -> f64x4 {
vhsubpd(a, b)
}
/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations of
/// indices 0, 1, 4, 5; while sums of elements from `b` are locations
/// 2, 3, 6, 7.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vhsubps))]
pub unsafe fn _mm256_hsub_ps(a: f32x8, b: f32x8) -> f32x8 {
vhsubps(a, b)
}
/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
#[inline(always)]
#[target_feature = "+avx"]
// FIXME Should be 'vxorpd' instruction.
#[cfg_attr(test, assert_instr(vxorps))]
pub unsafe fn _mm256_xor_pd(a: f64x4, b: f64x4) -> f64x4 {
let a: u64x4 = mem::transmute(a);
let b: u64x4 = mem::transmute(b);
mem::transmute(a ^ b)
}
/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vxorps))]
pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 {
let a: u32x8 = mem::transmute(a);
let b: u32x8 = mem::transmute(b);
mem::transmute(a ^ b)
}
/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
pub unsafe fn _mm256_cvtepi32_pd(a: i32x4) -> f64x4 {
simd_cast(a)
}
/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
pub unsafe fn _mm256_cvtepi32_ps(a: i32x8) -> f32x8 {
vcvtdq2ps(a)
}
/// Convert packed double-precision (64-bit) floating-point elements in `a`
/// to packed single-precision (32-bit) floating-point elements.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
pub unsafe fn _mm256_cvtpd_ps(a: f64x4) -> f32x4 {
vcvtpd2ps(a)
}
/// Convert packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
pub unsafe fn _mm256_cvtps_epi32(a: f32x8) -> i32x8 {
vcvtps2dq(a)
}
/// Convert packed single-precision (32-bit) floating-point elements in `a`
/// to packed double-precision (64-bit) floating-point elements.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
pub unsafe fn _mm256_cvtps_pd(a: f32x4) -> f64x4 {
a.as_f64x4()
}
/// Convert packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
pub unsafe fn _mm256_cvttpd_epi32(a: f64x4) -> i32x4 {
vcvttpd2dq(a)
}
/// Convert packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
pub unsafe fn _mm256_cvtpd_epi32(a: f64x4) -> i32x4 {
vcvtpd2dq(a)
}
/// Convert packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
pub unsafe fn _mm256_cvttps_epi32(a: f32x8) -> i32x8 {
vcvttps2dq(a)
}
/// Extract 128 bits (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a`, selected with `imm8`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vextractf128))]
pub unsafe fn _mm256_extractf128_ps(a: f32x8, imm8: i32) -> f32x4 {
match imm8 & 1 {
0 => simd_shuffle4(a, _mm256_undefined_ps(), [0, 1, 2, 3]),
_ => simd_shuffle4(a, _mm256_undefined_ps(), [4, 5, 6, 7]),
}
}
/// Extract 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a`, selected with `imm8`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vextractf128))]
pub unsafe fn _mm256_extractf128_pd(a: f64x4, imm8: i32) -> f64x2 {
match imm8 & 1 {
0 => simd_shuffle2(a, _mm256_undefined_pd(), [0, 1]),
_ => simd_shuffle2(a, _mm256_undefined_pd(), [2, 3]),
}
}
/// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vextractf128))]
pub unsafe fn _mm256_extractf128_si256(a: i64x4, imm8: i32) -> i64x2 {
match imm8 & 1 {
0 => simd_shuffle2(a, _mm256_undefined_si256(), [0, 1]),
_ => simd_shuffle2(a, _mm256_undefined_si256(), [2, 3]),
}
}
/// Extract an 8-bit integer from `a`, selected with `imm8`.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_extract_epi8(a: i8x32, imm8: i32) -> i32 {
a.extract(imm8 as u32 & 31) as i32
}
/// Extract a 16-bit integer from `a`, selected with `imm8`.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_extract_epi16(a: i16x16, imm8: i32) -> i32 {
a.extract(imm8 as u32 & 15) as i32
}
/// Extract a 32-bit integer from `a`, selected with `imm8`.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_extract_epi32(a: i32x8, imm8: i32) -> i32 {
a.extract(imm8 as u32 & 7) as i32
}
/// Extract a 64-bit integer from `a`, selected with `imm8`.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_extract_epi64(a: i64x4, imm8: i32) -> i32 {
a.extract(imm8 as u32 & 3) as i32
}
/// Zero the contents of all XMM or YMM registers.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vzeroall))]
pub unsafe fn _mm256_zeroall() {
vzeroall()
}
/// Zero the upper 128 bits of all YMM registers;
/// the lower 128-bits of the registers are unmodified.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vzeroupper))]
pub unsafe fn _mm256_zeroupper() {
vzeroupper()
}
/// Shuffle single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vpermilps))]
pub unsafe fn _mm256_permutevar_ps(a: f32x8, b: i32x8) -> f32x8 {
vpermilps256(a, b)
}
/// Shuffle single-precision (32-bit) floating-point elements in `a`
/// using the control in `b`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vpermilps))]
pub unsafe fn _mm_permutevar_ps(a: f32x4, b: i32x4) -> f32x4 {
vpermilps(a, b)
}
/// Shuffle single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 {
let imm8 = (imm8 & 0xFF) as u8;
const fn add4(x: u32) -> u32 { x + 4 }
macro_rules! shuffle4 {
($a:expr, $b:expr, $c:expr, $d:expr) => {
simd_shuffle8(a, _mm256_undefined_ps(), [
$a, $b, $c, $d, add4($a), add4($b), add4($c), add4($d)
])
}
}
macro_rules! shuffle3 {
($a:expr, $b:expr, $c:expr) => {
match (imm8 >> 6) & 0b11 {
0b00 => shuffle4!($a, $b, $c, 0),
0b01 => shuffle4!($a, $b, $c, 1),
0b10 => shuffle4!($a, $b, $c, 2),
_ => shuffle4!($a, $b, $c, 3),
}
}
}
macro_rules! shuffle2 {
($a:expr, $b:expr) => {
match (imm8 >> 4) & 0b11 {
0b00 => shuffle3!($a, $b, 0),
0b01 => shuffle3!($a, $b, 1),
0b10 => shuffle3!($a, $b, 2),
_ => shuffle3!($a, $b, 3),
}
}
}
macro_rules! shuffle1 {
($a:expr) => {
match (imm8 >> 2) & 0b11 {
0b00 => shuffle2!($a, 0),
0b01 => shuffle2!($a, 1),
0b10 => shuffle2!($a, 2),
_ => shuffle2!($a, 3),
}
}
}
match (imm8 >> 0) & 0b11 {
0b00 => shuffle1!(0),
0b01 => shuffle1!(1),
0b10 => shuffle1!(2),
_ => shuffle1!(3),
}
}
/// Return vector of type `f32x8` with undefined elements.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_undefined_ps() -> f32x8 {
mem::uninitialized()
}
/// Return vector of type `f64x4` with undefined elements.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_undefined_pd() -> f64x4 {
mem::uninitialized()
}
/// Return vector of type `i64x4` with undefined elements.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_undefined_si256() -> i64x4 {
mem::uninitialized()
}
/// LLVM intrinsics used in the above functions
#[allow(improper_ctypes)]
extern "C" {
@ -297,12 +751,47 @@ extern "C" {
fn sqrtpd256(a: f64x4) -> f64x4;
#[link_name = "llvm.x86.avx.sqrt.ps.256"]
fn sqrtps256(a: f32x8) -> f32x8;
#[link_name = "llvm.x86.avx.blendv.pd.256"]
fn vblendvpd(a: f64x4, b: f64x4, c: f64x4) -> f64x4;
#[link_name = "llvm.x86.avx.blendv.ps.256"]
fn vblendvps(a: f32x8, b: f32x8, c: f32x8) -> f32x8;
#[link_name = "llvm.x86.avx.dp.ps.256"]
fn vdpps(a: f32x8, b: f32x8, imm8: i32) -> f32x8;
#[link_name = "llvm.x86.avx.hadd.pd.256"]
fn vhaddpd(a: f64x4, b: f64x4) -> f64x4;
#[link_name = "llvm.x86.avx.hadd.ps.256"]
fn vhaddps(a: f32x8, b: f32x8) -> f32x8;
#[link_name = "llvm.x86.avx.hsub.pd.256"]
fn vhsubpd(a: f64x4, b: f64x4) -> f64x4;
#[link_name = "llvm.x86.avx.hsub.ps.256"]
fn vhsubps(a: f32x8, b: f32x8) -> f32x8;
#[link_name = "llvm.x86.avx.cvtdq2.ps.256"]
fn vcvtdq2ps(a: i32x8) -> f32x8;
#[link_name = "llvm.x86.avx.cvt.pd2.ps.256"]
fn vcvtpd2ps(a: f64x4) -> f32x4;
#[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
fn vcvtps2dq(a: f32x8) -> i32x8;
#[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
fn vcvttpd2dq(a: f64x4) -> i32x4;
#[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
fn vcvtpd2dq(a: f64x4) -> i32x4;
#[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
fn vcvttps2dq(a: f32x8) -> i32x8;
#[link_name = "llvm.x86.avx.vzeroall"]
fn vzeroall();
#[link_name = "llvm.x86.avx.vzeroupper"]
fn vzeroupper();
#[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
fn vpermilps256(a: f32x8, b: i32x8) -> f32x8;
#[link_name = "llvm.x86.avx.vpermilvar.ps"]
fn vpermilps(a: f32x4, b: i32x4) -> f32x4;
}
#[cfg(test)]
mod tests {
use stdsimd_test::simd_test;
use v128::{f32x4, f64x2, i32x4, i64x2};
use v256::*;
use x86::avx;
@ -360,6 +849,31 @@ mod tests {
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_shuffle_pd() {
let a = f64x4::new(1.0, 4.0, 5.0, 8.0);
let b = f64x4::new(2.0, 3.0, 6.0, 7.0);
let r = avx::_mm256_shuffle_pd(a, b, 0xF);
let e = f64x4::new(4.0, 3.0, 8.0, 7.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_andnot_pd() {
let a = f64x4::splat(0.0);
let b = f64x4::splat(0.6);
let r = avx::_mm256_andnot_pd(a, b);
assert_eq!(r, b);
}
#[simd_test = "avx"]
unsafe fn _mm256_andnot_ps() {
let a = f32x8::splat(0.0);
let b = f32x8::splat(0.6);
let r = avx::_mm256_andnot_ps(a, b);
assert_eq!(r, b);
}
#[simd_test = "avx"]
unsafe fn _mm256_max_pd() {
let a = f64x4::new(1.0, 4.0, 5.0, 8.0);
@ -543,4 +1057,280 @@ mod tests {
let e = f64x4::new(1.0, 3.0, 8.0, 5.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_blend_pd() {
let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
let b = f64x4::new(4.0, 3.0, 2.0, 5.0);
let r = avx::_mm256_blend_pd(a, b, 0x0);
assert_eq!(r, f64x4::new(4.0, 9.0, 16.0, 25.0));
let r = avx::_mm256_blend_pd(a, b, 0x3);
assert_eq!(r, f64x4::new(4.0, 3.0, 16.0, 25.0));
let r = avx::_mm256_blend_pd(a, b, 0xF);
assert_eq!(r, f64x4::new(4.0, 3.0, 2.0, 5.0));
}
#[simd_test = "avx"]
unsafe fn _mm256_blendv_pd() {
let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
let b = f64x4::new(4.0, 3.0, 2.0, 5.0);
let c = f64x4::new(0.0, 0.0, !0 as f64, !0 as f64);
let r = avx::_mm256_blendv_pd(a, b, c);
let e = f64x4::new(4.0, 9.0, 2.0, 5.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_blendv_ps() {
let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
let b = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
let c = f32x8::new(0.0, 0.0, 0.0, 0.0, !0 as f32, !0 as f32, !0 as f32, !0 as f32);
let r = avx::_mm256_blendv_ps(a, b, c);
let e = f32x8::new(4.0, 9.0, 16.0, 25.0, 8.0, 9.0, 64.0, 50.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_dp_ps() {
let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
let b = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
let r = avx::_mm256_dp_ps(a, b, 0xFF);
let e = f32x8::new(200.0, 200.0, 200.0, 200.0, 2387.0, 2387.0, 2387.0, 2387.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_hadd_pd() {
let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
let b = f64x4::new(4.0, 3.0, 2.0, 5.0);
let r = avx::_mm256_hadd_pd(a, b);
let e = f64x4::new(13.0, 7.0, 41.0, 7.0);
assert_eq!(r, e);
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_hadd_pd(a, b);
let e = f64x4::new(3.0, 11.0, 7.0, 15.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_hadd_ps() {
let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
let b = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
let r = avx::_mm256_hadd_ps(a, b);
let e = f32x8::new(13.0, 41.0, 7.0, 7.0, 13.0, 41.0, 17.0, 114.0);
assert_eq!(r, e);
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0);
let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_hadd_ps(a, b);
let e = f32x8::new(3.0, 7.0, 11.0, 15.0, 3.0, 7.0, 11.0, 15.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_hsub_pd() {
let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
let b = f64x4::new(4.0, 3.0, 2.0, 5.0);
let r = avx::_mm256_hsub_pd(a, b);
let e = f64x4::new(-5.0, 1.0, -9.0, -3.0);
assert_eq!(r, e);
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_hsub_pd(a, b);
let e = f64x4::new(-1., -1., -1., -1.);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_hsub_ps() {
let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
let b = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
let r = avx::_mm256_hsub_ps(a, b);
let e = f32x8::new(-5.0, -9.0, 1.0, -3.0, -5.0, -9.0, -1.0, 14.0);
assert_eq!(r, e);
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0);
let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_hsub_ps(a, b);
let e = f32x8::new(-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_xor_pd() {
let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
let b = f64x4::splat(0.0);
let r = avx::_mm256_xor_pd(a, b);
assert_eq!(r, a);
}
#[simd_test = "avx"]
unsafe fn _mm256_xor_ps() {
let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
let b = f32x8::splat(0.0);
let r = avx::_mm256_xor_ps(a, b);
assert_eq!(r, a);
}
#[simd_test = "avx"]
unsafe fn _mm256_cvtepi32_pd() {
let a = i32x4::new(4, 9, 16, 25);
let r = avx::_mm256_cvtepi32_pd(a);
let e = f64x4::new(4.0, 9.0, 16.0, 25.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_cvtepi32_ps() {
let a = i32x8::new(4, 9, 16, 25, 4, 9, 16, 25);
let r = avx::_mm256_cvtepi32_ps(a);
let e = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_cvtpd_ps() {
let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
let r = avx::_mm256_cvtpd_ps(a);
let e = f32x4::new(4.0, 9.0, 16.0, 25.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_cvtps_epi32() {
let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
let r = avx::_mm256_cvtps_epi32(a);
let e = i32x8::new(4, 9, 16, 25, 4, 9, 16, 25);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_cvtps_pd() {
let a = f32x4::new(4.0, 9.0, 16.0, 25.0);
let r = avx::_mm256_cvtps_pd(a);
let e = f64x4::new(4.0, 9.0, 16.0, 25.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_cvttpd_epi32() {
let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
let r = avx::_mm256_cvttpd_epi32(a);
let e = i32x4::new(4, 9, 16, 25);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_cvtpd_epi32() {
let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
let r = avx::_mm256_cvtpd_epi32(a);
let e = i32x4::new(4, 9, 16, 25);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_cvttps_epi32() {
let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
let r = avx::_mm256_cvttps_epi32(a);
let e = i32x8::new(4, 9, 16, 25, 4, 9, 16, 25);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_extractf128_ps() {
let a = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
let r = avx::_mm256_extractf128_ps(a, 0);
let e = f32x4::new(4.0, 3.0, 2.0, 5.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_extractf128_pd() {
let a = f64x4::new(4.0, 3.0, 2.0, 5.0);
let r = avx::_mm256_extractf128_pd(a, 0);
let e = f64x2::new(4.0, 3.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_extractf128_si256() {
let a = i64x4::new(4, 3, 2, 5);
let r = avx::_mm256_extractf128_si256(a, 0);
let e = i64x2::new(4, 3);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_extract_epi8() {
let a = i8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
let r = avx::_mm256_extract_epi8(a, 0);
assert_eq!(r, 1);
}
#[simd_test = "avx"]
unsafe fn _mm256_extract_epi16() {
let a = i16x16::new(
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15);
let r = avx::_mm256_extract_epi16(a, 0);
assert_eq!(r, 0);
}
#[simd_test = "avx"]
unsafe fn _mm256_extract_epi32() {
let a = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let r = avx::_mm256_extract_epi32(a, 0);
assert_eq!(r, 1);
}
#[simd_test = "avx"]
unsafe fn _mm256_extract_epi64() {
let a = i64x4::new(0, 1, 2, 3);
let r = avx::_mm256_extract_epi64(a, 3);
assert_eq!(r, 3);
}
#[simd_test = "avx"]
unsafe fn _mm256_zeroall() {
avx::_mm256_zeroall();
}
#[simd_test = "avx"]
unsafe fn _mm256_zeroupper() {
avx::_mm256_zeroupper();
}
#[simd_test = "avx"]
unsafe fn _mm256_permutevar_ps() {
let a = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let r = avx::_mm256_permutevar_ps(a, b);
let e = f32x8::new(3.0, 2.0, 5.0, 4.0, 9.0, 64.0, 50.0, 8.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm_permutevar_ps() {
let a = f32x4::new(4.0, 3.0, 2.0, 5.0);
let b = i32x4::new(1, 2, 3, 4);
let r = avx::_mm_permutevar_ps(a, b);
let e = f32x4::new(3.0, 2.0, 5.0, 4.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_permute_ps() {
let a = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
let r = avx::_mm256_permute_ps(a, 0x1b);
let e = f32x8::new(5.0, 2.0, 3.0, 4.0, 50.0, 64.0, 9.0, 8.0);
assert_eq!(r, e);
}
}

View file

@ -271,7 +271,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
}
}
let probably_only_one_instruction = function.instrs.len() < 20;
let probably_only_one_instruction = function.instrs.len() < 30;
if found && probably_only_one_instruction {
return