* avx: _mm_permute_ps and sse: _mm_undefined_ps

* avx: _mm256_permutevar_pd, _mm_permutevar_pd

* avx: _mm256_permute_pd

* avx: _mm256_shuffle_pd fixed

* avx: _mm_permute_pd, sse2: _mm_undefined_pd

* avx: _mm256_permute2f128_ps

* avx: _mm256_permute2f128_pd

* avx: _mm256_permute2f128_si256

* avx: _mm256_broadcast_ss

* avx: _mm_broadcast_ss

* avx: _mm256_broadcast_sd

* avx: _mm256_broadcast_ps

* avx: _mm256_broadcast_pd

* avx: _mm_cmp_pd

* avx: _mm256_cmp_pd

* avx: _mm_cmp_ps

* avx: _mm256_cmp_ps

* avx: _mm_cmp_sd

* avx: _mm_cmp_ss

* avx: _mm256_insertf128_pd, _mm256_castpd128_pd256

* avx: _mm256_insertf128_si256, _mm256_castsi128_si256

* avx: _mm256_insertf128_ps, _mm256_castps128_ps256

* avx: _mm256_insert_epi8

* avx: _mm256_insert_epi16

* avx: _mm256_insert_epi32

* avx: _mm256_insert_epi64

* Try to fix i586 build

* Fix missing inline and target_feature

* sse: fix _mm_undefined_ps
This commit is contained in:
gwenn 2017-10-09 23:05:36 +02:00 committed by Alex Crichton
parent 807ec089b7
commit 7c88f7c49b
4 changed files with 784 additions and 1 deletion

View file

@ -74,7 +74,7 @@ pub unsafe fn _mm256_or_ps(a: f32x8, b: f32x8) -> f32x8 {
/// lanes using the control in `imm8`.
#[inline(always)]
#[target_feature = "+avx"]
//#[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x0))] // FIXME
#[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x1))]
pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
let imm8 = (imm8 & 0xFF) as u8;
macro_rules! shuffle4 {
@ -484,6 +484,152 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 {
mem::transmute(a ^ b)
}
/// Equal (ordered, non-signaling)
pub const _CMP_EQ_OQ: u8 = 0x00;
/// Less-than (ordered, signaling)
pub const _CMP_LT_OS: u8 = 0x01;
/// Less-than-or-equal (ordered, signaling)
pub const _CMP_LE_OS: u8 = 0x02;
/// Unordered (non-signaling)
pub const _CMP_UNORD_Q: u8 = 0x03;
/// Not-equal (unordered, non-signaling)
pub const _CMP_NEQ_UQ: u8 = 0x04;
/// Not-less-than (unordered, signaling)
pub const _CMP_NLT_US: u8 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
pub const _CMP_NLE_US: u8 = 0x06;
/// Ordered (non-signaling)
pub const _CMP_ORD_Q: u8 = 0x07;
/// Equal (unordered, non-signaling)
pub const _CMP_EQ_UQ: u8 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
pub const _CMP_NGE_US: u8 = 0x09;
/// Not-greater-than (unordered, signaling)
pub const _CMP_NGT_US: u8 = 0x0a;
/// False (ordered, non-signaling)
pub const _CMP_FALSE_OQ: u8 = 0x0b;
/// Not-equal (ordered, non-signaling)
pub const _CMP_NEQ_OQ: u8 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
pub const _CMP_GE_OS: u8 = 0x0d;
/// Greater-than (ordered, signaling)
pub const _CMP_GT_OS: u8 = 0x0e;
/// True (unordered, non-signaling)
pub const _CMP_TRUE_UQ: u8 = 0x0f;
/// Equal (ordered, signaling)
pub const _CMP_EQ_OS: u8 = 0x10;
/// Less-than (ordered, non-signaling)
pub const _CMP_LT_OQ: u8 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
pub const _CMP_LE_OQ: u8 = 0x12;
/// Unordered (signaling)
pub const _CMP_UNORD_S: u8 = 0x13;
/// Not-equal (unordered, signaling)
pub const _CMP_NEQ_US: u8 = 0x14;
/// Not-less-than (unordered, non-signaling)
pub const _CMP_NLT_UQ: u8 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
pub const _CMP_NLE_UQ: u8 = 0x16;
/// Ordered (signaling)
pub const _CMP_ORD_S: u8 = 0x17;
/// Equal (unordered, signaling)
pub const _CMP_EQ_US: u8 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
pub const _CMP_NGE_UQ: u8 = 0x19;
/// Not-greater-than (unordered, non-signaling)
pub const _CMP_NGT_UQ: u8 = 0x1a;
/// False (ordered, signaling)
pub const _CMP_FALSE_OS: u8 = 0x1b;
/// Not-equal (ordered, signaling)
pub const _CMP_NEQ_OS: u8 = 0x1c;
/// Greater-than-or-equal (ordered, non-signaling)
pub const _CMP_GE_OQ: u8 = 0x1d;
/// Greater-than (ordered, non-signaling)
pub const _CMP_GT_OQ: u8 = 0x1e;
/// True (unordered, signaling)
pub const _CMP_TRUE_US: u8 = 0x1f;
/// Compare packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `imm8`.
///
/// `imm8` should be one of the `_CMP_*` predicate constants; only its
/// low five bits are used (masked by `constify_imm6`). Each result lane
/// is all-ones when the predicate holds, all-zeros otherwise.
#[inline(always)]
#[target_feature = "+avx,+sse2"]
#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd
pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
// The LLVM intrinsic needs a constant predicate, so expand the runtime
// `imm8` into one constant call per possible value.
macro_rules! call {
($imm8:expr) => { vcmppd(a, b, $imm8) }
}
constify_imm6!(imm8, call)
}
/// Compare packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `imm8`.
///
/// `imm8` should be one of the `_CMP_*` predicate constants; only its
/// low five bits are used (masked by `constify_imm6`).
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd
pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: u8) -> f64x4 {
// Constant-predicate expansion; see `constify_imm6`.
macro_rules! call {
($imm8:expr) => { vcmppd256(a, b, $imm8) }
}
constify_imm6!(imm8, call)
}
/// Compare packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `imm8`.
///
/// `imm8` should be one of the `_CMP_*` predicate constants; only its
/// low five bits are used (masked by `constify_imm6`).
#[inline(always)]
#[target_feature = "+avx,+sse"]
#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps
pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
// Constant-predicate expansion; see `constify_imm6`.
macro_rules! call {
($imm8:expr) => { vcmpps(a, b, $imm8) }
}
constify_imm6!(imm8, call)
}
/// Compare packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `imm8`.
///
/// `imm8` should be one of the `_CMP_*` predicate constants; only its
/// low five bits are used (masked by `constify_imm6`).
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps
pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 {
// Constant-predicate expansion; see `constify_imm6`.
macro_rules! call {
($imm8:expr) => { vcmpps256(a, b, $imm8) }
}
constify_imm6!(imm8, call)
}
/// Compare the lower double-precision (64-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `imm8`,
/// store the result in the lower element of returned vector,
/// and copy the upper element from `a` to the upper element of returned vector.
///
/// `imm8` should be one of the `_CMP_*` predicate constants; only its
/// low five bits are used (masked by `constify_imm6`).
#[inline(always)]
#[target_feature = "+avx,+sse2"]
#[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd
pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
// Constant-predicate expansion; see `constify_imm6`.
macro_rules! call {
($imm8:expr) => { vcmpsd(a, b, $imm8) }
}
constify_imm6!(imm8, call)
}
/// Compare the lower single-precision (32-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `imm8`,
/// store the result in the lower element of returned vector,
/// and copy the upper 3 packed elements from `a` to the upper elements of
/// returned vector.
///
/// `imm8` should be one of the `_CMP_*` predicate constants; only its
/// low five bits are used (masked by `constify_imm6`).
#[inline(always)]
#[target_feature = "+avx,+sse"]
#[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss
pub unsafe fn _mm_cmp_ss(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
// Constant-predicate expansion; see `constify_imm6`.
macro_rules! call {
($imm8:expr) => { vcmpss(a, b, $imm8) }
}
constify_imm6!(imm8, call)
}
/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
#[inline(always)]
@ -707,6 +853,328 @@ pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 {
}
}
/// Shuffle single-precision (32-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// Each 2-bit field of `imm8` selects one output element from `a`:
/// bits 1:0 pick element 0, bits 3:2 element 1, bits 5:4 element 2,
/// and bits 7:6 element 3 (e.g. `0x1b` reverses the vector).
#[inline(always)]
#[target_feature = "+avx,+sse"]
#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 {
use x86::sse::_mm_undefined_ps;
let imm8 = (imm8 & 0xFF) as u8;
// `simd_shuffle4` requires compile-time constant indices, so the nested
// macros below expand the runtime `imm8` into a tree of constant
// shuffles — one `match` level per 2-bit selector field.
macro_rules! shuffle4 {
($a:expr, $b:expr, $c:expr, $d:expr) => {
simd_shuffle4(a, _mm_undefined_ps(), [
$a, $b, $c, $d
])
}
}
// Bits 7:6 choose the final (fourth) index.
macro_rules! shuffle3 {
($a:expr, $b:expr, $c:expr) => {
match (imm8 >> 6) & 0b11 {
0b00 => shuffle4!($a, $b, $c, 0),
0b01 => shuffle4!($a, $b, $c, 1),
0b10 => shuffle4!($a, $b, $c, 2),
_ => shuffle4!($a, $b, $c, 3),
}
}
}
// Bits 5:4 choose the third index.
macro_rules! shuffle2 {
($a:expr, $b:expr) => {
match (imm8 >> 4) & 0b11 {
0b00 => shuffle3!($a, $b, 0),
0b01 => shuffle3!($a, $b, 1),
0b10 => shuffle3!($a, $b, 2),
_ => shuffle3!($a, $b, 3),
}
}
}
// Bits 3:2 choose the second index.
macro_rules! shuffle1 {
($a:expr) => {
match (imm8 >> 2) & 0b11 {
0b00 => shuffle2!($a, 0),
0b01 => shuffle2!($a, 1),
0b10 => shuffle2!($a, 2),
_ => shuffle2!($a, 3),
}
}
}
// Bits 1:0 choose the first index.
match (imm8 >> 0) & 0b11 {
0b00 => shuffle1!(0),
0b01 => shuffle1!(1),
0b10 => shuffle1!(2),
_ => shuffle1!(3),
}
}
/// Shuffle double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vpermilpd))]
pub unsafe fn _mm256_permutevar_pd(a: f64x4, b: i64x4) -> f64x4 {
vpermilpd256(a, b)
}
/// Shuffle double-precision (64-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// NOTE(review): per the tests below, bit 1 of each control element
/// selects the source element — confirm against the vpermilpd spec.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vpermilpd))]
pub unsafe fn _mm_permutevar_pd(a: f64x2, b: i64x2) -> f64x2 {
vpermilpd(a, b)
}
/// Shuffle double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// Bit 0 of `imm8` picks output element 0 from `a[0..2]`, bit 1 picks
/// element 1 from `a[0..2]`; bits 2 and 3 do the same for the upper lane
/// (`a[2..4]`).
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))]
pub unsafe fn _mm256_permute_pd(a: f64x4, imm8: i32) -> f64x4 {
    let imm8 = (imm8 & 0xFF) as u8;
    // `simd_shuffle4` requires constant indices, so the nested macros
    // expand the runtime `imm8` into a tree of constant shuffles.
    // The expansion is used in expression position, so it must not carry
    // a trailing semicolon (rustc only tolerates that via a
    // future-incompatibility lint).
    macro_rules! shuffle4 {
        ($a:expr, $b:expr, $c:expr, $d:expr) => {
            simd_shuffle4(a, _mm256_undefined_pd(), [$a, $b, $c, $d])
        }
    }
    macro_rules! shuffle3 {
        ($a:expr, $b:expr, $c:expr) => {
            match (imm8 >> 3) & 0x1 {
                0 => shuffle4!($a, $b, $c, 2),
                _ => shuffle4!($a, $b, $c, 3),
            }
        }
    }
    macro_rules! shuffle2 {
        ($a:expr, $b:expr) => {
            match (imm8 >> 2) & 0x1 {
                0 => shuffle3!($a, $b, 2),
                _ => shuffle3!($a, $b, 3),
            }
        }
    }
    macro_rules! shuffle1 {
        ($a:expr) => {
            match (imm8 >> 1) & 0x1 {
                0 => shuffle2!($a, 0),
                _ => shuffle2!($a, 1),
            }
        }
    }
    match imm8 & 0x1 {
        0 => shuffle1!(0),
        _ => shuffle1!(1),
    }
}
/// Shuffle double-precision (64-bit) floating-point elements in `a`
/// using the control in `imm8`.
#[inline(always)]
#[target_feature = "+avx,+sse2"]
#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))]
pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 {
use x86::sse2::_mm_undefined_pd;
let imm8 = (imm8 & 0xFF) as u8;
macro_rules! shuffle2 {
($a:expr, $b:expr) => {
simd_shuffle2(a, _mm_undefined_pd(), [$a, $b]);
}
}
macro_rules! shuffle1 {
($a:expr) => {
match (imm8 >> 1) & 0x1 {
0 => shuffle2!($a, 0),
_ => shuffle2!($a, 1),
}
}
}
match (imm8 >> 0) & 0x1 {
0 => shuffle1!(0),
_ => shuffle1!(1),
}
}
/// Shuffle 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x5))]
pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i8) -> f32x8 {
// The intrinsic needs a constant control byte; expand all 256 values.
macro_rules! call {
($imm8:expr) => { vperm2f128ps256(a, b, $imm8) }
}
constify_imm8!(imm8, call)
}
/// Shuffle 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 {
// The intrinsic needs a constant control byte; expand all 256 values.
macro_rules! call {
($imm8:expr) => { vperm2f128pd256(a, b, $imm8) }
}
constify_imm8!(imm8, call)
}
/// Shuffle 256-bits (composed of integer data) selected by `imm8`
/// from `a` and `b`.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
pub unsafe fn _mm256_permute2f128_si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 {
// The intrinsic needs a constant control byte; expand all 256 values.
macro_rules! call {
($imm8:expr) => { vperm2f128si256(a, b, $imm8) }
}
constify_imm8!(imm8, call)
}
/// Broadcast a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vbroadcastss))]
pub unsafe fn _mm256_broadcast_ss(f: &f32) -> f32x8 {
// A plain splat; the `assert_instr` above checks it lowers to
// `vbroadcastss`.
f32x8::splat(*f)
}
/// Broadcast a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vbroadcastss))]
pub unsafe fn _mm_broadcast_ss(f: &f32) -> f32x4 {
// A plain splat; the `assert_instr` above checks the lowering.
f32x4::splat(*f)
}
/// Broadcast a double-precision (64-bit) floating-point element from memory
/// to all elements of the returned vector.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
pub unsafe fn _mm256_broadcast_sd(f: &f64) -> f64x4 {
// A plain splat; the `assert_instr` above checks the lowering.
f64x4::splat(*f)
}
/// Broadcast 128 bits from memory (composed of 4 packed single-precision
/// (32-bit) floating-point elements) to all elements of the returned vector.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
pub unsafe fn _mm256_broadcast_ps(a: &f32x4) -> f32x8 {
// Delegates to the LLVM `vbroadcastf128.ps.256` intrinsic.
vbroadcastf128ps256(a)
}
/// Broadcast 128 bits from memory (composed of 2 packed double-precision
/// (64-bit) floating-point elements) to all elements of the returned vector.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
pub unsafe fn _mm256_broadcast_pd(a: &f64x2) -> f64x4 {
// Delegates to the LLVM `vbroadcastf128.pd.256` intrinsic.
vbroadcastf128pd256(a)
}
/// Copy `a` to result, then insert 128 bits (composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// Only bit 0 of `imm8` is used: 0 replaces the lower half, 1 the upper.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
pub unsafe fn _mm256_insertf128_ps(a: f32x8, b: f32x4, imm8: i32) -> f32x8 {
// Indices 0..8 refer to `a`, indices 8..12 to the widened `b`.
match imm8 & 1 {
0 => simd_shuffle8(a, _mm256_castps128_ps256(b), [8, 9, 10, 11, 4, 5, 6, 7]),
_ => simd_shuffle8(a, _mm256_castps128_ps256(b), [0, 1, 2, 3, 8, 9, 10, 11]),
}
}
/// Copy `a` to result, then insert 128 bits (composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// Only bit 0 of `imm8` is used: 0 replaces the lower half, 1 the upper.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 {
// Indices 0..4 refer to `a`, indices 4..6 to the widened `b`.
match imm8 & 1 {
0 => simd_shuffle4(a, _mm256_castpd128_pd256(b), [4, 5, 2, 3]),
_ => simd_shuffle4(a, _mm256_castpd128_pd256(b), [0, 1, 4, 5]),
}
}
/// Copy `a` to result, then insert 128 bits from `b` into result
/// at the location specified by `imm8`.
///
/// Only bit 0 of `imm8` is used: 0 replaces the lower half, 1 the upper.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
pub unsafe fn _mm256_insertf128_si256(a: i64x4, b: i64x2, imm8: i32) -> i64x4 {
// Indices 0..4 refer to `a`, indices 4..6 to the widened `b`.
match imm8 & 1 {
0 => simd_shuffle4(a, _mm256_castsi128_si256(b), [4, 5, 2, 3]),
_ => simd_shuffle4(a, _mm256_castsi128_si256(b), [0, 1, 4, 5]),
}
}
/// Copy `a` to result, and insert the 8-bit integer `i` into result
/// at the location specified by `index`.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_insert_epi8(a: i8x32, i: i8, index: i32) -> i8x32 {
    // `replace` consumes the vector and returns the updated copy, so no
    // intermediate binding is needed. The index is masked to the 32-lane
    // range, mirroring Intel's wrap-around semantics.
    a.replace(index as u32 & 31, i)
}
/// Copy `a` to result, and insert the 16-bit integer `i` into result
/// at the location specified by `index`.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_insert_epi16(a: i16x16, i: i16, index: i32) -> i16x16 {
    // `replace` consumes the vector and returns the updated copy; mask
    // the index to the 16-lane range.
    a.replace(index as u32 & 15, i)
}
/// Copy `a` to result, and insert the 32-bit integer `i` into result
/// at the location specified by `index`.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_insert_epi32(a: i32x8, i: i32, index: i32) -> i32x8 {
    // `replace` consumes the vector and returns the updated copy; mask
    // the index to the 8-lane range.
    a.replace(index as u32 & 7, i)
}
/// Copy `a` to result, and insert the 64-bit integer `i` into result
/// at the location specified by `index`.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_insert_epi64(a: i64x4, i: i64, index: i32) -> i64x4 {
    // `replace` consumes the vector and returns the updated copy; mask
    // the index to the 4-lane range.
    a.replace(index as u32 & 3, i)
}
/// Casts vector of type __m128 to type __m256;
/// the upper 128 bits of the result are undefined.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_castps128_ps256(a: f32x4) -> f32x8 {
// FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1])
// Workaround: duplicate element 0 into the upper lanes, since
// `simd_shuffle8` has no "don't care" index. Callers must not rely on
// the upper half's contents.
simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0])
}
/// Casts vector of type __m128d to type __m256d;
/// the upper 128 bits of the result are undefined.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_castpd128_pd256(a: f64x2) -> f64x4 {
// FIXME simd_shuffle4(a, a, [0, 1, -1, -1])
// Workaround: duplicate element 0 into the upper lanes; callers must
// not rely on the upper half's contents.
simd_shuffle4(a, a, [0, 1, 0, 0])
}
/// Casts vector of type __m128i to type __m256i;
/// the upper 128 bits of the result are undefined.
#[inline(always)]
#[target_feature = "+avx"]
pub unsafe fn _mm256_castsi128_si256(a: i64x2) -> i64x4 {
// FIXME simd_shuffle4(a, a, [0, 1, -1, -1])
// Workaround: duplicate element 0 into the upper lanes; callers must
// not rely on the upper half's contents.
simd_shuffle4(a, a, [0, 1, 0, 0])
}
/// Return vector of type `f32x8` with undefined elements.
#[inline(always)]
#[target_feature = "+avx"]
@ -765,6 +1233,18 @@ extern "C" {
fn vhsubpd(a: f64x4, b: f64x4) -> f64x4;
#[link_name = "llvm.x86.avx.hsub.ps.256"]
fn vhsubps(a: f32x8, b: f32x8) -> f32x8;
// Raw comparison intrinsics backing the `_mm*_cmp_*` wrappers; the
// predicate `imm8` must be a compile-time constant at the call site
// (see `constify_imm6`).
#[link_name = "llvm.x86.sse2.cmp.pd"]
fn vcmppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2;
#[link_name = "llvm.x86.avx.cmp.pd.256"]
fn vcmppd256(a: f64x4, b: f64x4, imm8: u8) -> f64x4;
#[link_name = "llvm.x86.sse.cmp.ps"]
fn vcmpps(a: f32x4, b: f32x4, imm8: u8) -> f32x4;
#[link_name = "llvm.x86.avx.cmp.ps.256"]
fn vcmpps256(a: f32x8, b: f32x8, imm8: u8) -> f32x8;
#[link_name = "llvm.x86.sse2.cmp.sd"]
fn vcmpsd(a: f64x2, b: f64x2, imm8: u8) -> f64x2;
#[link_name = "llvm.x86.sse.cmp.ss"]
fn vcmpss(a: f32x4, b: f32x4, imm8: u8) -> f32x4;
#[link_name = "llvm.x86.avx.cvtdq2.ps.256"]
fn vcvtdq2ps(a: i32x8) -> f32x8;
#[link_name = "llvm.x86.avx.cvt.pd2.ps.256"]
@ -785,6 +1265,20 @@ extern "C" {
fn vpermilps256(a: f32x8, b: i32x8) -> f32x8;
#[link_name = "llvm.x86.avx.vpermilvar.ps"]
fn vpermilps(a: f32x4, b: i32x4) -> f32x4;
// Variable-control permutes, 128-bit lane permutes and 128-bit
// broadcasts backing the `_mm*_permutevar_pd`, `_mm256_permute2f128_*`
// and `_mm256_broadcast_p*` wrappers above.
#[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
fn vpermilpd256(a: f64x4, b: i64x4) -> f64x4;
#[link_name = "llvm.x86.avx.vpermilvar.pd"]
fn vpermilpd(a: f64x2, b: i64x2) -> f64x2;
#[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
fn vperm2f128ps256(a: f32x8, b: f32x8, imm8: i8) -> f32x8;
#[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
fn vperm2f128pd256(a: f64x4, b: f64x4, imm8: i8) -> f64x4;
#[link_name = "llvm.x86.avx.vperm2f128.si.256"]
fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
#[link_name = "llvm.x86.avx.vbroadcastf128.ps.256"]
fn vbroadcastf128ps256(a: &f32x4) -> f32x8;
#[link_name = "llvm.x86.avx.vbroadcastf128.pd.256"]
fn vbroadcastf128pd256(a: &f64x2) -> f64x4;
}
#[cfg(test)]
@ -1176,6 +1670,64 @@ mod tests {
assert_eq!(r, a);
}
#[simd_test = "avx"]
unsafe fn _mm_cmp_pd() {
// Both lanes satisfy GE, so both results are all-one masks, which read
// as NaN when interpreted as f64.
let a = f64x2::new(4.0, 9.0);
let b = f64x2::new(4.0, 3.0);
let r = avx::_mm_cmp_pd(a, b, avx::_CMP_GE_OS);
assert!(r.extract(0).is_nan());
assert!(r.extract(1).is_nan());
}
#[simd_test = "avx"]
unsafe fn _mm256_cmp_pd() {
// No lane satisfies GE, so every result lane is the all-zero mask (0.0).
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_cmp_pd(a, b, avx::_CMP_GE_OS);
let e = f64x4::splat(0.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm_cmp_ps() {
// Only lane 0 satisfies GE (all-one mask → NaN); the rest are zero.
let a = f32x4::new(4.0, 3.0, 2.0, 5.0);
let b = f32x4::new(4.0, 9.0, 16.0, 25.0);
let r = avx::_mm_cmp_ps(a, b, avx::_CMP_GE_OS);
assert!(r.extract(0).is_nan());
assert_eq!(r.extract(1), 0.0);
assert_eq!(r.extract(2), 0.0);
assert_eq!(r.extract(3), 0.0);
}
#[simd_test = "avx"]
unsafe fn _mm256_cmp_ps() {
// No lane satisfies GE, so every result lane is the all-zero mask (0.0).
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0);
let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_cmp_ps(a, b, avx::_CMP_GE_OS);
let e = f32x8::splat(0.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm_cmp_sd() {
// Only lane 0 is compared (GE true → NaN mask); lane 1 is copied from `a`.
let a = f64x2::new(4.0, 9.0);
let b = f64x2::new(4.0, 3.0);
let r = avx::_mm_cmp_sd(a, b, avx::_CMP_GE_OS);
assert!(r.extract(0).is_nan());
assert_eq!(r.extract(1), 9.0);
}
#[simd_test = "avx"]
unsafe fn _mm_cmp_ss() {
// Only lane 0 is compared (GE true → NaN mask); lanes 1..4 come from `a`.
let a = f32x4::new(4.0, 3.0, 2.0, 5.0);
let b = f32x4::new(4.0, 9.0, 16.0, 25.0);
let r = avx::_mm_cmp_ss(a, b, avx::_CMP_GE_OS);
assert!(r.extract(0).is_nan());
assert_eq!(r.extract(1), 3.0);
assert_eq!(r.extract(2), 2.0);
assert_eq!(r.extract(3), 5.0);
}
#[simd_test = "avx"]
unsafe fn _mm256_cvtepi32_pd() {
let a = i32x4::new(4, 9, 16, 25);
@ -1333,4 +1885,181 @@ mod tests {
let e = f32x8::new(5.0, 2.0, 3.0, 4.0, 50.0, 64.0, 9.0, 8.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm_permute_ps() {
// 0x1b = 0b00_01_10_11 selects indices [3, 2, 1, 0]: a full reversal.
let a = f32x4::new(4.0, 3.0, 2.0, 5.0);
let r = avx::_mm_permute_ps(a, 0x1b);
let e = f32x4::new(5.0, 2.0, 3.0, 4.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_permutevar_pd() {
// Bit 1 of each control element selects within its 128-bit lane:
// controls (1,2,3,4) → bit-1 values (0,1,1,0) → elements (0,1,3,2).
let a = f64x4::new(4.0, 3.0, 2.0, 5.0);
let b = i64x4::new(1, 2, 3, 4);
let r = avx::_mm256_permutevar_pd(a, b);
let e = f64x4::new(4.0, 3.0, 5.0, 2.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm_permutevar_pd() {
// Controls (3, 0) → bit-1 values (1, 0) → elements (1, 0): a swap.
let a = f64x2::new(4.0, 3.0);
let b = i64x2::new(3, 0);
let r = avx::_mm_permutevar_pd(a, b);
let e = f64x2::new(3.0, 4.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_permute_pd() {
// imm8 = 0b0101 swaps the pair within each 128-bit lane.
let a = f64x4::new(4.0, 3.0, 2.0, 5.0);
let r = avx::_mm256_permute_pd(a, 5);
let e = f64x4::new(3.0, 4.0, 5.0, 2.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm_permute_pd() {
// imm8 = 0b01 swaps the two elements.
let a = f64x2::new(4.0, 3.0);
let r = avx::_mm_permute_pd(a, 1);
let e = f64x2::new(3.0, 4.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_permute2f128_ps() {
// 0x13: low half ← selector 3 (high half of `b`), high half ← selector 1
// (high half of `a`).
let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0);
let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_permute2f128_ps(a, b, 0x13);
let e = f32x8::new(5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_permute2f128_pd() {
// 0x31: low half ← high half of `a`, high half ← high half of `b`.
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
let r = avx::_mm256_permute2f128_pd(a, b, 0x31);
let e = f64x4::new(3.0, 4.0, 7.0, 8.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_permute2f128_si256() {
// 0x20: low half ← low half of `a`, high half ← low half of `b`.
let a = i32x8::new(1, 2, 3, 4, 1, 2, 3, 4);
let b = i32x8::new(5, 6, 7, 8, 5, 6, 7, 8);
let r = avx::_mm256_permute2f128_si256(a, b, 0x20);
let e = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_broadcast_ss() {
// The scalar must appear in all eight lanes.
let r = avx::_mm256_broadcast_ss(&3.0);
let e = f32x8::splat(3.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm_broadcast_ss() {
// The scalar must appear in all four lanes.
let r = avx::_mm_broadcast_ss(&3.0);
let e = f32x4::splat(3.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_broadcast_sd() {
// The scalar must appear in all four lanes.
let r = avx::_mm256_broadcast_sd(&3.0);
let e = f64x4::splat(3.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_broadcast_ps() {
// The 128-bit source must appear in both halves of the result.
let a = f32x4::new(4.0, 3.0, 2.0, 5.0);
let r = avx::_mm256_broadcast_ps(&a);
let e = f32x8::new(4.0, 3.0, 2.0, 5.0, 4.0, 3.0, 2.0, 5.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_broadcast_pd() {
// The 128-bit source must appear in both halves of the result.
let a = f64x2::new(4.0, 3.0);
let r = avx::_mm256_broadcast_pd(&a);
let e = f64x4::new(4.0, 3.0, 4.0, 3.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_insertf128_ps() {
// imm8 = 0 replaces the lower 128 bits with `b`, keeping the upper half.
let a = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
let b = f32x4::new(4.0, 9.0, 16.0, 25.0);
let r = avx::_mm256_insertf128_ps(a, b, 0);
let e = f32x8::new(4.0, 9.0, 16.0, 25.0, 8.0, 9.0, 64.0, 50.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_insertf128_pd() {
// imm8 = 0 replaces the lower 128 bits with `b`, keeping the upper half.
let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
let b = f64x2::new(5.0, 6.0);
let r = avx::_mm256_insertf128_pd(a, b, 0);
let e = f64x4::new(5.0, 6.0, 3.0, 4.0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_insertf128_si256() {
// imm8 = 0 replaces the lower 128 bits with `b`, keeping the upper half.
let a = i64x4::new(1, 2, 3, 4);
let b = i64x2::new(5, 6);
let r = avx::_mm256_insertf128_si256(a, b, 0);
let e = i64x4::new(5, 6, 3, 4);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_insert_epi8() {
// Only the last lane (index 31) should change.
let a = i8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32);
let r = avx::_mm256_insert_epi8(a, 0, 31);
let e = i8x32::new(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_insert_epi16() {
// Only the last lane (index 15) should change.
let a = i16x16::new(
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15);
let r = avx::_mm256_insert_epi16(a, 0, 15);
let e = i16x16::new(
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_insert_epi32() {
// Only the last lane (index 7) should change.
let a = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let r = avx::_mm256_insert_epi32(a, 0, 7);
let e = i32x8::new(1, 2, 3, 4, 5, 6, 7, 0);
assert_eq!(r, e);
}
#[simd_test = "avx"]
unsafe fn _mm256_insert_epi64() {
// Only the last lane (index 3) should change.
let a = i64x4::new(1, 2, 3, 4);
let r = avx::_mm256_insert_epi64(a, 0, 3);
let e = i64x4::new(1, 2, 3, 0);
assert_eq!(r, e);
}
}

View file

@ -261,3 +261,43 @@ macro_rules! constify_imm8 {
}
}
}
// Turn a runtime immediate into a compile-time constant by matching every
// possible value and expanding `$expand!` with a literal.
//
// NOTE(review): despite the `imm6` name, only the low FIVE bits are
// matched (`& 0b1_1111`, i.e. 0..=31). That range exactly covers the 32
// `_CMP_*` predicates used by the callers, but the name suggests six
// bits — confirm intent before reusing this for wider immediates.
macro_rules! constify_imm6 {
($imm8:expr, $expand:ident) => {
#[allow(overflowing_literals)]
match $imm8 & 0b1_1111 {
0 => $expand!(0),
1 => $expand!(1),
2 => $expand!(2),
3 => $expand!(3),
4 => $expand!(4),
5 => $expand!(5),
6 => $expand!(6),
7 => $expand!(7),
8 => $expand!(8),
9 => $expand!(9),
10 => $expand!(10),
11 => $expand!(11),
12 => $expand!(12),
13 => $expand!(13),
14 => $expand!(14),
15 => $expand!(15),
16 => $expand!(16),
17 => $expand!(17),
18 => $expand!(18),
19 => $expand!(19),
20 => $expand!(20),
21 => $expand!(21),
22 => $expand!(22),
23 => $expand!(23),
24 => $expand!(24),
25 => $expand!(25),
26 => $expand!(26),
27 => $expand!(27),
28 => $expand!(28),
29 => $expand!(29),
30 => $expand!(30),
_ => $expand!(31),
}
}
}

View file

@ -868,6 +868,13 @@ pub unsafe fn _mm_prefetch(p: *const c_void, strategy: i8) {
pref!(strategy)
}
/// Return vector of type __m128 with undefined elements.
///
/// Callers must never read the lanes of this vector; it exists only as a
/// placeholder second operand for shuffles.
#[inline(always)]
#[target_feature = "+sse"]
pub unsafe fn _mm_undefined_ps() -> f32x4 {
// NOTE(review): splatting `mem::uninitialized()` produces an
// uninitialized value; if this ever miscompiles, a zeroed vector would
// be a safe substitute.
f32x4::splat(mem::uninitialized())
}
#[allow(improper_ctypes)]
extern {
#[link_name = "llvm.x86.sse.add.ss"]

View file

@ -1827,6 +1827,13 @@ pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> f64x2 {
f64x2::new(d, d)
}
/// Return vector of type __m128d with undefined elements.
///
/// Callers must never read the lanes of this vector; it exists only as a
/// placeholder second operand for shuffles.
#[inline(always)]
#[target_feature = "+sse2"]
pub unsafe fn _mm_undefined_pd() -> f64x2 {
// NOTE(review): splatting `mem::uninitialized()` produces an
// uninitialized value; if this ever miscompiles, a zeroed vector would
// be a safe substitute.
f64x2::splat(mem::uninitialized())
}
#[allow(improper_ctypes)]
extern {
#[link_name = "llvm.x86.sse2.pause"]