Rework SIMD zeroing
This commit is contained in:
parent
91c0dabca3
commit
4d2911ba4b
20 changed files with 1468 additions and 2906 deletions
|
|
@ -10,6 +10,9 @@ macro_rules! simd_ty {
|
|||
|
||||
#[allow(clippy::use_self)]
|
||||
impl $id {
|
||||
/// A value of this type where all elements are zeroed out.
|
||||
pub(crate) const ZERO: Self = unsafe { crate::mem::zeroed() };
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) const fn new($($param_name: $elem_type),*) -> Self {
|
||||
$id([$($param_name),*])
|
||||
|
|
|
|||
|
|
@ -2232,7 +2232,7 @@ pub fn v128_any_true(a: v128) -> bool {
|
|||
pub fn i8x16_abs(a: v128) -> v128 {
|
||||
unsafe {
|
||||
let a = a.as_i8x16();
|
||||
let zero = simd::i8x16::splat(0);
|
||||
let zero = simd::i8x16::ZERO;
|
||||
simd_select::<simd::m8x16, simd::i8x16>(simd_lt(a, zero), simd_sub(zero, a), a).v128()
|
||||
}
|
||||
}
|
||||
|
|
@ -2524,7 +2524,7 @@ pub use i16x8_extadd_pairwise_u8x16 as u16x8_extadd_pairwise_u8x16;
|
|||
#[stable(feature = "wasm_simd", since = "1.54.0")]
|
||||
pub fn i16x8_abs(a: v128) -> v128 {
|
||||
let a = a.as_i16x8();
|
||||
let zero = simd::i16x8::splat(0);
|
||||
let zero = simd::i16x8::ZERO;
|
||||
unsafe {
|
||||
simd_select::<simd::m16x8, simd::i16x8>(simd_lt(a, zero), simd_sub(zero, a), a).v128()
|
||||
}
|
||||
|
|
@ -3012,7 +3012,7 @@ pub use i32x4_extadd_pairwise_u16x8 as u32x4_extadd_pairwise_u16x8;
|
|||
#[stable(feature = "wasm_simd", since = "1.54.0")]
|
||||
pub fn i32x4_abs(a: v128) -> v128 {
|
||||
let a = a.as_i32x4();
|
||||
let zero = simd::i32x4::splat(0);
|
||||
let zero = simd::i32x4::ZERO;
|
||||
unsafe {
|
||||
simd_select::<simd::m32x4, simd::i32x4>(simd_lt(a, zero), simd_sub(zero, a), a).v128()
|
||||
}
|
||||
|
|
@ -3394,7 +3394,7 @@ pub use i32x4_extmul_high_u16x8 as u32x4_extmul_high_u16x8;
|
|||
#[stable(feature = "wasm_simd", since = "1.54.0")]
|
||||
pub fn i64x2_abs(a: v128) -> v128 {
|
||||
let a = a.as_i64x2();
|
||||
let zero = simd::i64x2::splat(0);
|
||||
let zero = simd::i64x2::ZERO;
|
||||
unsafe {
|
||||
simd_select::<simd::m64x2, simd::i64x2>(simd_lt(a, zero), simd_sub(zero, a), a).v128()
|
||||
}
|
||||
|
|
@ -4105,7 +4105,7 @@ pub fn i32x4_trunc_sat_f64x2_zero(a: v128) -> v128 {
|
|||
let ret: simd::i32x4 = unsafe {
|
||||
simd_shuffle!(
|
||||
llvm_i32x2_trunc_sat_f64x2_s(a.as_f64x2()),
|
||||
simd::i32x2::splat(0),
|
||||
simd::i32x2::ZERO,
|
||||
[0, 1, 2, 3],
|
||||
)
|
||||
};
|
||||
|
|
@ -4129,7 +4129,7 @@ pub fn u32x4_trunc_sat_f64x2_zero(a: v128) -> v128 {
|
|||
let ret: simd::i32x4 = unsafe {
|
||||
simd_shuffle!(
|
||||
llvm_i32x2_trunc_sat_f64x2_u(a.as_f64x2()),
|
||||
simd::i32x2::splat(0),
|
||||
simd::i32x2::ZERO,
|
||||
[0, 1, 2, 3],
|
||||
)
|
||||
};
|
||||
|
|
@ -4176,7 +4176,7 @@ pub fn f32x4_demote_f64x2_zero(a: v128) -> v128 {
|
|||
unsafe {
|
||||
simd_cast::<simd::f64x4, simd::f32x4>(simd_shuffle!(
|
||||
a.as_f64x2(),
|
||||
simd::f64x2::splat(0.0),
|
||||
simd::f64x2::ZERO,
|
||||
[0, 1, 2, 3]
|
||||
))
|
||||
.v128()
|
||||
|
|
|
|||
|
|
@ -515,7 +515,7 @@ pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
|
|||
#[cfg_attr(test, assert_instr(vblendvpd))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
|
||||
let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::splat(0));
|
||||
let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
|
||||
transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
|
||||
}
|
||||
|
||||
|
|
@ -528,7 +528,7 @@ pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
|
|||
#[cfg_attr(test, assert_instr(vblendvps))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
|
||||
let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::splat(0));
|
||||
let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
|
||||
transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
|
||||
}
|
||||
|
||||
|
|
@ -983,11 +983,7 @@ pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
|
|||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
|
||||
static_assert_uimm_bits!(IMM1, 1);
|
||||
let dst: i64x2 = simd_shuffle!(
|
||||
a.as_i64x4(),
|
||||
_mm256_undefined_si256().as_i64x4(),
|
||||
[[0, 1], [2, 3]][IMM1 as usize],
|
||||
);
|
||||
let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
|
||||
transmute(dst)
|
||||
}
|
||||
|
||||
|
|
@ -2139,7 +2135,7 @@ pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
|
|||
pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
|
||||
// Propagate the highest bit to the rest, because simd_bitmask
|
||||
// requires all-1 or all-0.
|
||||
let mask: i64x4 = simd_lt(transmute(a), i64x4::splat(0));
|
||||
let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
|
||||
simd_bitmask::<i64x4, u8>(mask).into()
|
||||
}
|
||||
|
||||
|
|
@ -2155,7 +2151,7 @@ pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
|
|||
pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
|
||||
// Propagate the highest bit to the rest, because simd_bitmask
|
||||
// requires all-1 or all-0.
|
||||
let mask: i32x8 = simd_lt(transmute(a), i32x8::splat(0));
|
||||
let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
|
||||
simd_bitmask::<i32x8, u8>(mask).into()
|
||||
}
|
||||
|
||||
|
|
@ -2167,7 +2163,7 @@ pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
|
|||
#[cfg_attr(test, assert_instr(vxorp))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_setzero_pd() -> __m256d {
|
||||
_mm256_set1_pd(0.0)
|
||||
const { mem::zeroed() }
|
||||
}
|
||||
|
||||
/// Returns vector of type __m256 with all elements set to zero.
|
||||
|
|
@ -2178,7 +2174,7 @@ pub unsafe fn _mm256_setzero_pd() -> __m256d {
|
|||
#[cfg_attr(test, assert_instr(vxorps))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_setzero_ps() -> __m256 {
|
||||
_mm256_set1_ps(0.0)
|
||||
const { mem::zeroed() }
|
||||
}
|
||||
|
||||
/// Returns vector of type __m256i with all elements set to zero.
|
||||
|
|
@ -2189,7 +2185,7 @@ pub unsafe fn _mm256_setzero_ps() -> __m256 {
|
|||
#[cfg_attr(test, assert_instr(vxor))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_setzero_si256() -> __m256i {
|
||||
_mm256_set1_epi8(0)
|
||||
const { mem::zeroed() }
|
||||
}
|
||||
|
||||
/// Sets packed double-precision (64-bit) floating-point elements in returned
|
||||
|
|
@ -2722,7 +2718,7 @@ pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
|
|||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
|
||||
let a = a.as_i64x2();
|
||||
let undefined = _mm_undefined_si128().as_i64x2();
|
||||
let undefined = i64x2::ZERO;
|
||||
let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
|
||||
transmute(dst)
|
||||
}
|
||||
|
|
@ -2752,7 +2748,7 @@ pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
|
|||
// instructions, thus it has zero latency.
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
|
||||
let b = _mm_setzero_si128().as_i64x2();
|
||||
let b = i64x2::ZERO;
|
||||
let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
|
||||
transmute(dst)
|
||||
}
|
||||
|
|
@ -2782,7 +2778,7 @@ pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
|
|||
// This intrinsic has no corresponding instruction.
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_undefined_ps() -> __m256 {
|
||||
_mm256_set1_ps(0.0)
|
||||
const { mem::zeroed() }
|
||||
}
|
||||
|
||||
/// Returns vector of type `__m256d` with indeterminate elements.
|
||||
|
|
@ -2795,7 +2791,7 @@ pub unsafe fn _mm256_undefined_ps() -> __m256 {
|
|||
// This intrinsic has no corresponding instruction.
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_undefined_pd() -> __m256d {
|
||||
_mm256_set1_pd(0.0)
|
||||
const { mem::zeroed() }
|
||||
}
|
||||
|
||||
/// Returns vector of type __m256i with indeterminate elements.
|
||||
|
|
@ -2808,7 +2804,7 @@ pub unsafe fn _mm256_undefined_pd() -> __m256d {
|
|||
// This intrinsic has no corresponding instruction.
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_undefined_si256() -> __m256i {
|
||||
__m256i([0, 0, 0, 0])
|
||||
const { mem::zeroed() }
|
||||
}
|
||||
|
||||
/// Sets packed __m256 returned vector with the supplied values.
|
||||
|
|
|
|||
|
|
@ -33,8 +33,7 @@ use stdarch_test::assert_instr;
|
|||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
|
||||
let a = a.as_i32x8();
|
||||
let zero = i32x8::splat(0);
|
||||
let r = simd_select::<m32x8, _>(simd_lt(a, zero), simd_neg(a), a);
|
||||
let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
|
||||
transmute(r)
|
||||
}
|
||||
|
||||
|
|
@ -47,8 +46,7 @@ pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
|
|||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
|
||||
let a = a.as_i16x16();
|
||||
let zero = i16x16::splat(0);
|
||||
let r = simd_select::<m16x16, _>(simd_lt(a, zero), simd_neg(a), a);
|
||||
let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
|
||||
transmute(r)
|
||||
}
|
||||
|
||||
|
|
@ -61,8 +59,7 @@ pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
|
|||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i {
|
||||
let a = a.as_i8x32();
|
||||
let zero = i8x32::splat(0);
|
||||
let r = simd_select::<m8x32, _>(simd_lt(a, zero), simd_neg(a), a);
|
||||
let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
|
||||
transmute(r)
|
||||
}
|
||||
|
||||
|
|
@ -168,12 +165,12 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
|
|||
// If palignr is shifting the pair of vectors more than the size of two
|
||||
// lanes, emit zero.
|
||||
if IMM8 > 32 {
|
||||
return _mm256_set1_epi8(0);
|
||||
return _mm256_setzero_si256();
|
||||
}
|
||||
// If palignr is shifting the pair of input vectors more than one lane,
|
||||
// but less than two lanes, convert to shifting in zeroes.
|
||||
let (a, b) = if IMM8 > 16 {
|
||||
(_mm256_set1_epi8(0), a)
|
||||
(_mm256_setzero_si256(), a)
|
||||
} else {
|
||||
(a, b)
|
||||
};
|
||||
|
|
@ -471,7 +468,7 @@ pub unsafe fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
|
|||
#[cfg_attr(test, assert_instr(vpblendvb))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
|
||||
let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::splat(0));
|
||||
let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
|
||||
transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
|
||||
}
|
||||
|
||||
|
|
@ -484,8 +481,7 @@ pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m25
|
|||
#[cfg_attr(test, assert_instr(vpbroadcastb))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
|
||||
let zero = _mm_setzero_si128();
|
||||
let ret = simd_shuffle!(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
|
||||
let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
|
||||
transmute::<i8x16, _>(ret)
|
||||
}
|
||||
|
||||
|
|
@ -498,8 +494,7 @@ pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
|
|||
#[cfg_attr(test, assert_instr(vpbroadcastb))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
|
||||
let zero = _mm_setzero_si128();
|
||||
let ret = simd_shuffle!(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
|
||||
let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
|
||||
transmute::<i8x32, _>(ret)
|
||||
}
|
||||
|
||||
|
|
@ -514,8 +509,7 @@ pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
|
|||
#[cfg_attr(test, assert_instr(vbroadcastss))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
|
||||
let zero = _mm_setzero_si128();
|
||||
let ret = simd_shuffle!(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
|
||||
let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
|
||||
transmute::<i32x4, _>(ret)
|
||||
}
|
||||
|
||||
|
|
@ -530,8 +524,7 @@ pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
|
|||
#[cfg_attr(test, assert_instr(vbroadcastss))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
|
||||
let zero = _mm_setzero_si128();
|
||||
let ret = simd_shuffle!(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
|
||||
let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
|
||||
transmute::<i32x8, _>(ret)
|
||||
}
|
||||
|
||||
|
|
@ -595,8 +588,7 @@ pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
|
|||
#[target_feature(enable = "avx2")]
|
||||
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
|
||||
pub unsafe fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
|
||||
let zero = _mm_setzero_si128();
|
||||
let ret = simd_shuffle!(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
|
||||
let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
|
||||
transmute::<i64x4, _>(ret)
|
||||
}
|
||||
|
||||
|
|
@ -610,8 +602,7 @@ pub unsafe fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
|
|||
#[target_feature(enable = "avx2")]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
|
||||
let zero = _mm_setzero_si128();
|
||||
let ret = simd_shuffle!(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
|
||||
let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
|
||||
transmute::<i64x4, _>(ret)
|
||||
}
|
||||
|
||||
|
|
@ -648,8 +639,7 @@ pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
|
|||
#[cfg_attr(test, assert_instr(vpbroadcastw))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
|
||||
let zero = _mm_setzero_si128();
|
||||
let ret = simd_shuffle!(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
|
||||
let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
|
||||
transmute::<i16x8, _>(ret)
|
||||
}
|
||||
|
||||
|
|
@ -662,8 +652,7 @@ pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
|
|||
#[cfg_attr(test, assert_instr(vpbroadcastw))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
|
||||
let zero = _mm_setzero_si128();
|
||||
let ret = simd_shuffle!(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
|
||||
let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
|
||||
transmute::<i16x16, _>(ret)
|
||||
}
|
||||
|
||||
|
|
@ -917,7 +906,7 @@ pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
|
|||
pub unsafe fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
|
||||
static_assert_uimm_bits!(IMM1, 1);
|
||||
let a = a.as_i64x4();
|
||||
let b = _mm256_undefined_si256().as_i64x4();
|
||||
let b = i64x4::ZERO;
|
||||
let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
|
||||
transmute(dst)
|
||||
}
|
||||
|
|
@ -1005,7 +994,7 @@ pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
|
|||
offsets: __m128i,
|
||||
) -> __m128i {
|
||||
static_assert_imm8_scale!(SCALE);
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
let zero = i32x4::ZERO;
|
||||
let neg_one = _mm_set1_epi32(-1).as_i32x4();
|
||||
let offsets = offsets.as_i32x4();
|
||||
let slice = slice as *const i8;
|
||||
|
|
@ -1054,7 +1043,7 @@ pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
|
|||
offsets: __m256i,
|
||||
) -> __m256i {
|
||||
static_assert_imm8_scale!(SCALE);
|
||||
let zero = _mm256_setzero_si256().as_i32x8();
|
||||
let zero = i32x8::ZERO;
|
||||
let neg_one = _mm256_set1_epi32(-1).as_i32x8();
|
||||
let offsets = offsets.as_i32x8();
|
||||
let slice = slice as *const i8;
|
||||
|
|
@ -1187,7 +1176,7 @@ pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
|
|||
offsets: __m128i,
|
||||
) -> __m128i {
|
||||
static_assert_imm8_scale!(SCALE);
|
||||
let zero = _mm_setzero_si128().as_i64x2();
|
||||
let zero = i64x2::ZERO;
|
||||
let neg_one = _mm_set1_epi64x(-1).as_i64x2();
|
||||
let offsets = offsets.as_i32x4();
|
||||
let slice = slice as *const i8;
|
||||
|
|
@ -1236,7 +1225,7 @@ pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
|
|||
offsets: __m128i,
|
||||
) -> __m256i {
|
||||
static_assert_imm8_scale!(SCALE);
|
||||
let zero = _mm256_setzero_si256().as_i64x4();
|
||||
let zero = i64x4::ZERO;
|
||||
let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
|
||||
let offsets = offsets.as_i32x4();
|
||||
let slice = slice as *const i8;
|
||||
|
|
@ -1372,7 +1361,7 @@ pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
|
|||
offsets: __m128i,
|
||||
) -> __m128i {
|
||||
static_assert_imm8_scale!(SCALE);
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
let zero = i32x4::ZERO;
|
||||
let neg_one = _mm_set1_epi64x(-1).as_i32x4();
|
||||
let offsets = offsets.as_i64x2();
|
||||
let slice = slice as *const i8;
|
||||
|
|
@ -1421,7 +1410,7 @@ pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
|
|||
offsets: __m256i,
|
||||
) -> __m128i {
|
||||
static_assert_imm8_scale!(SCALE);
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
let zero = i32x4::ZERO;
|
||||
let neg_one = _mm_set1_epi64x(-1).as_i32x4();
|
||||
let offsets = offsets.as_i64x4();
|
||||
let slice = slice as *const i8;
|
||||
|
|
@ -1554,7 +1543,7 @@ pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
|
|||
offsets: __m128i,
|
||||
) -> __m128i {
|
||||
static_assert_imm8_scale!(SCALE);
|
||||
let zero = _mm_setzero_si128().as_i64x2();
|
||||
let zero = i64x2::ZERO;
|
||||
let neg_one = _mm_set1_epi64x(-1).as_i64x2();
|
||||
let slice = slice as *const i8;
|
||||
let offsets = offsets.as_i64x2();
|
||||
|
|
@ -1603,7 +1592,7 @@ pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
|
|||
offsets: __m256i,
|
||||
) -> __m256i {
|
||||
static_assert_imm8_scale!(SCALE);
|
||||
let zero = _mm256_setzero_si256().as_i64x4();
|
||||
let zero = i64x4::ZERO;
|
||||
let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
|
||||
let slice = slice as *const i8;
|
||||
let offsets = offsets.as_i64x4();
|
||||
|
|
@ -2052,7 +2041,7 @@ pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
|
|||
#[cfg_attr(test, assert_instr(vpmovmskb))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
|
||||
let z = i8x32::splat(0);
|
||||
let z = i8x32::ZERO;
|
||||
let m: i8x32 = simd_lt(a.as_i8x32(), z);
|
||||
simd_bitmask::<_, u32>(m) as i32
|
||||
}
|
||||
|
|
@ -2265,7 +2254,7 @@ pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
|
|||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let zero = _mm256_setzero_si256().as_i64x4();
|
||||
let zero = i64x4::ZERO;
|
||||
let r: i64x4 = simd_shuffle!(
|
||||
a.as_i64x4(),
|
||||
zero,
|
||||
|
|
@ -2670,9 +2659,8 @@ pub unsafe fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
|
|||
}
|
||||
}
|
||||
let a = a.as_i8x32();
|
||||
let zero = _mm256_setzero_si256().as_i8x32();
|
||||
let r: i8x32 = simd_shuffle!(
|
||||
zero,
|
||||
i8x32::ZERO,
|
||||
a,
|
||||
[
|
||||
mask(IMM8, 0),
|
||||
|
|
@ -2864,7 +2852,7 @@ pub unsafe fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
|
|||
pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let a = a.as_i8x32();
|
||||
let zero = _mm256_setzero_si256().as_i8x32();
|
||||
let zero = i8x32::ZERO;
|
||||
let r: i8x32 = match IMM8 % 16 {
|
||||
0 => simd_shuffle!(
|
||||
a,
|
||||
|
|
|
|||
|
|
@ -66,8 +66,7 @@ pub unsafe fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __
|
|||
#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
|
||||
pub unsafe fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh {
|
||||
let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
|
||||
let zero = _mm_setzero_si128().as_u16x8();
|
||||
transmute(simd_select_bitmask(k, cvt, zero))
|
||||
transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors
|
||||
|
|
@ -110,8 +109,7 @@ pub unsafe fn _mm256_mask_cvtne2ps_pbh(
|
|||
#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
|
||||
pub unsafe fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh {
|
||||
let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
|
||||
let zero = _mm256_setzero_si256().as_u16x16();
|
||||
transmute(simd_select_bitmask(k, cvt, zero))
|
||||
transmute(simd_select_bitmask(k, cvt, u16x16::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors
|
||||
|
|
@ -156,8 +154,7 @@ pub unsafe fn _mm512_mask_cvtne2ps_pbh(
|
|||
#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
|
||||
pub unsafe fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh {
|
||||
let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
|
||||
let zero = _mm512_setzero_si512().as_u16x32();
|
||||
transmute(simd_select_bitmask(k, cvt, zero))
|
||||
transmute(simd_select_bitmask(k, cvt, u16x32::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
|
||||
|
|
@ -194,8 +191,7 @@ pub unsafe fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) ->
|
|||
#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
|
||||
pub unsafe fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh {
|
||||
let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
|
||||
let zero = _mm_setzero_si128().as_u16x8();
|
||||
transmute(simd_select_bitmask(k, cvt, zero))
|
||||
transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
|
||||
|
|
@ -232,8 +228,7 @@ pub unsafe fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) ->
|
|||
#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
|
||||
pub unsafe fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh {
|
||||
let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
|
||||
let zero = _mm256_setzero_si256().as_u16x16();
|
||||
transmute(simd_select_bitmask(k, cvt, zero))
|
||||
transmute(simd_select_bitmask(k, cvt, u16x16::ZERO))
|
||||
}
|
||||
|
||||
/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
|
||||
|
|
@ -314,8 +309,7 @@ pub unsafe fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __
|
|||
#[cfg_attr(test, assert_instr("vdpbf16ps"))]
|
||||
pub unsafe fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
|
||||
let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
|
||||
let zero = _mm256_setzero_ps().as_f32x8();
|
||||
transmute(simd_select_bitmask(k, rst, zero))
|
||||
transmute(simd_select_bitmask(k, rst, f32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
|
||||
|
|
@ -362,8 +356,7 @@ pub unsafe fn _mm512_maskz_dpbf16_ps(
|
|||
b: __m512bh,
|
||||
) -> __m512 {
|
||||
let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
|
||||
let zero = _mm512_setzero_ps().as_f32x16();
|
||||
transmute(simd_select_bitmask(k, rst, zero))
|
||||
transmute(simd_select_bitmask(k, rst, f32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
|
||||
|
|
@ -400,8 +393,7 @@ pub unsafe fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> _
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 {
|
||||
let cvt = _mm512_cvtpbh_ps(a);
|
||||
let zero = _mm512_setzero_ps();
|
||||
transmute(simd_select_bitmask(k, cvt.as_f32x16(), zero.as_f32x16()))
|
||||
transmute(simd_select_bitmask(k, cvt.as_f32x16(), f32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
|
||||
|
|
@ -438,8 +430,7 @@ pub unsafe fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 {
|
||||
let cvt = _mm256_cvtpbh_ps(a);
|
||||
let zero = _mm256_setzero_ps();
|
||||
transmute(simd_select_bitmask(k, cvt.as_f32x8(), zero.as_f32x8()))
|
||||
transmute(simd_select_bitmask(k, cvt.as_f32x8(), f32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
|
||||
|
|
@ -476,8 +467,7 @@ pub unsafe fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m12
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 {
|
||||
let cvt = _mm_cvtpbh_ps(a);
|
||||
let zero = _mm_setzero_ps();
|
||||
transmute(simd_select_bitmask(k, cvt.as_f32x4(), zero.as_f32x4()))
|
||||
transmute(simd_select_bitmask(k, cvt.as_f32x4(), f32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Converts a single BF16 (16-bit) floating-point element in a to a single-precision (32-bit) floating-point
|
||||
|
|
|
|||
|
|
@ -7,6 +7,9 @@
|
|||
//!
|
||||
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
|
||||
|
||||
use crate::core_arch::simd::i16x16;
|
||||
use crate::core_arch::simd::i16x32;
|
||||
use crate::core_arch::simd::i16x8;
|
||||
use crate::core_arch::simd::i8x16;
|
||||
use crate::core_arch::simd::i8x32;
|
||||
use crate::core_arch::simd::i8x64;
|
||||
|
|
@ -17,9 +20,6 @@ use crate::core_arch::x86::__mmask16;
|
|||
use crate::core_arch::x86::__mmask32;
|
||||
use crate::core_arch::x86::__mmask64;
|
||||
use crate::core_arch::x86::__mmask8;
|
||||
use crate::core_arch::x86::_mm256_setzero_si256;
|
||||
use crate::core_arch::x86::_mm512_setzero_si512;
|
||||
use crate::core_arch::x86::_mm_setzero_si128;
|
||||
use crate::core_arch::x86::m128iExt;
|
||||
use crate::core_arch::x86::m256iExt;
|
||||
use crate::core_arch::x86::m512iExt;
|
||||
|
|
@ -61,8 +61,11 @@ pub unsafe fn _mm512_popcnt_epi16(a: __m512i) -> __m512i {
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpopcntw))]
|
||||
pub unsafe fn _mm512_maskz_popcnt_epi16(k: __mmask32, a: __m512i) -> __m512i {
|
||||
let zero = _mm512_setzero_si512().as_i16x32();
|
||||
transmute(simd_select_bitmask(k, simd_ctpop(a.as_i16x32()), zero))
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
simd_ctpop(a.as_i16x32()),
|
||||
i16x32::ZERO,
|
||||
))
|
||||
}
|
||||
|
||||
/// For each packed 16-bit integer maps the value to the number of logical 1 bits.
|
||||
|
|
@ -105,8 +108,11 @@ pub unsafe fn _mm256_popcnt_epi16(a: __m256i) -> __m256i {
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpopcntw))]
|
||||
pub unsafe fn _mm256_maskz_popcnt_epi16(k: __mmask16, a: __m256i) -> __m256i {
|
||||
let zero = _mm256_setzero_si256().as_i16x16();
|
||||
transmute(simd_select_bitmask(k, simd_ctpop(a.as_i16x16()), zero))
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
simd_ctpop(a.as_i16x16()),
|
||||
i16x16::ZERO,
|
||||
))
|
||||
}
|
||||
|
||||
/// For each packed 16-bit integer maps the value to the number of logical 1 bits.
|
||||
|
|
@ -149,8 +155,11 @@ pub unsafe fn _mm_popcnt_epi16(a: __m128i) -> __m128i {
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpopcntw))]
|
||||
pub unsafe fn _mm_maskz_popcnt_epi16(k: __mmask8, a: __m128i) -> __m128i {
|
||||
let zero = _mm_setzero_si128().as_i16x8();
|
||||
transmute(simd_select_bitmask(k, simd_ctpop(a.as_i16x8()), zero))
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
simd_ctpop(a.as_i16x8()),
|
||||
i16x8::ZERO,
|
||||
))
|
||||
}
|
||||
|
||||
/// For each packed 16-bit integer maps the value to the number of logical 1 bits.
|
||||
|
|
@ -193,8 +202,11 @@ pub unsafe fn _mm512_popcnt_epi8(a: __m512i) -> __m512i {
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpopcntb))]
|
||||
pub unsafe fn _mm512_maskz_popcnt_epi8(k: __mmask64, a: __m512i) -> __m512i {
|
||||
let zero = _mm512_setzero_si512().as_i8x64();
|
||||
transmute(simd_select_bitmask(k, simd_ctpop(a.as_i8x64()), zero))
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
simd_ctpop(a.as_i8x64()),
|
||||
i8x64::ZERO,
|
||||
))
|
||||
}
|
||||
|
||||
/// For each packed 8-bit integer maps the value to the number of logical 1 bits.
|
||||
|
|
@ -237,8 +249,11 @@ pub unsafe fn _mm256_popcnt_epi8(a: __m256i) -> __m256i {
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpopcntb))]
|
||||
pub unsafe fn _mm256_maskz_popcnt_epi8(k: __mmask32, a: __m256i) -> __m256i {
|
||||
let zero = _mm256_setzero_si256().as_i8x32();
|
||||
transmute(simd_select_bitmask(k, simd_ctpop(a.as_i8x32()), zero))
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
simd_ctpop(a.as_i8x32()),
|
||||
i8x32::ZERO,
|
||||
))
|
||||
}
|
||||
|
||||
/// For each packed 8-bit integer maps the value to the number of logical 1 bits.
|
||||
|
|
@ -281,8 +296,11 @@ pub unsafe fn _mm_popcnt_epi8(a: __m128i) -> __m128i {
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpopcntb))]
|
||||
pub unsafe fn _mm_maskz_popcnt_epi8(k: __mmask16, a: __m128i) -> __m128i {
|
||||
let zero = _mm_setzero_si128().as_i8x16();
|
||||
transmute(simd_select_bitmask(k, simd_ctpop(a.as_i8x16()), zero))
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
simd_ctpop(a.as_i8x16()),
|
||||
i8x16::ZERO,
|
||||
))
|
||||
}
|
||||
|
||||
/// For each packed 8-bit integer maps the value to the number of logical 1 bits.
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -102,8 +102,7 @@ pub unsafe fn _mm512_mask_conflict_epi32(src: __m512i, k: __mmask16, a: __m512i)
|
|||
#[cfg_attr(test, assert_instr(vpconflictd))]
|
||||
pub unsafe fn _mm512_maskz_conflict_epi32(k: __mmask16, a: __m512i) -> __m512i {
|
||||
let conflict = _mm512_conflict_epi32(a).as_i32x16();
|
||||
let zero = _mm512_setzero_si512().as_i32x16();
|
||||
transmute(simd_select_bitmask(k, conflict, zero))
|
||||
transmute(simd_select_bitmask(k, conflict, i32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
|
||||
|
|
@ -138,8 +137,7 @@ pub unsafe fn _mm256_mask_conflict_epi32(src: __m256i, k: __mmask8, a: __m256i)
|
|||
#[cfg_attr(test, assert_instr(vpconflictd))]
|
||||
pub unsafe fn _mm256_maskz_conflict_epi32(k: __mmask8, a: __m256i) -> __m256i {
|
||||
let conflict = _mm256_conflict_epi32(a).as_i32x8();
|
||||
let zero = _mm256_setzero_si256().as_i32x8();
|
||||
transmute(simd_select_bitmask(k, conflict, zero))
|
||||
transmute(simd_select_bitmask(k, conflict, i32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
|
||||
|
|
@ -174,8 +172,7 @@ pub unsafe fn _mm_mask_conflict_epi32(src: __m128i, k: __mmask8, a: __m128i) ->
|
|||
#[cfg_attr(test, assert_instr(vpconflictd))]
|
||||
pub unsafe fn _mm_maskz_conflict_epi32(k: __mmask8, a: __m128i) -> __m128i {
|
||||
let conflict = _mm_conflict_epi32(a).as_i32x4();
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
transmute(simd_select_bitmask(k, conflict, zero))
|
||||
transmute(simd_select_bitmask(k, conflict, i32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
|
||||
|
|
@ -210,8 +207,7 @@ pub unsafe fn _mm512_mask_conflict_epi64(src: __m512i, k: __mmask8, a: __m512i)
|
|||
#[cfg_attr(test, assert_instr(vpconflictq))]
|
||||
pub unsafe fn _mm512_maskz_conflict_epi64(k: __mmask8, a: __m512i) -> __m512i {
|
||||
let conflict = _mm512_conflict_epi64(a).as_i64x8();
|
||||
let zero = _mm512_setzero_si512().as_i64x8();
|
||||
transmute(simd_select_bitmask(k, conflict, zero))
|
||||
transmute(simd_select_bitmask(k, conflict, i64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
|
||||
|
|
@ -246,8 +242,7 @@ pub unsafe fn _mm256_mask_conflict_epi64(src: __m256i, k: __mmask8, a: __m256i)
|
|||
#[cfg_attr(test, assert_instr(vpconflictq))]
|
||||
pub unsafe fn _mm256_maskz_conflict_epi64(k: __mmask8, a: __m256i) -> __m256i {
|
||||
let conflict = _mm256_conflict_epi64(a).as_i64x4();
|
||||
let zero = _mm256_setzero_si256().as_i64x4();
|
||||
transmute(simd_select_bitmask(k, conflict, zero))
|
||||
transmute(simd_select_bitmask(k, conflict, i64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
|
||||
|
|
@ -282,8 +277,7 @@ pub unsafe fn _mm_mask_conflict_epi64(src: __m128i, k: __mmask8, a: __m128i) ->
|
|||
#[cfg_attr(test, assert_instr(vpconflictq))]
|
||||
pub unsafe fn _mm_maskz_conflict_epi64(k: __mmask8, a: __m128i) -> __m128i {
|
||||
let conflict = _mm_conflict_epi64(a).as_i64x2();
|
||||
let zero = _mm_setzero_si128().as_i64x2();
|
||||
transmute(simd_select_bitmask(k, conflict, zero))
|
||||
transmute(simd_select_bitmask(k, conflict, i64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst.
|
||||
|
|
@ -318,8 +312,7 @@ pub unsafe fn _mm512_mask_lzcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) ->
|
|||
#[cfg_attr(test, assert_instr(vplzcntd))]
|
||||
pub unsafe fn _mm512_maskz_lzcnt_epi32(k: __mmask16, a: __m512i) -> __m512i {
|
||||
let zerocount = _mm512_lzcnt_epi32(a).as_i32x16();
|
||||
let zero = _mm512_setzero_si512().as_i32x16();
|
||||
transmute(simd_select_bitmask(k, zerocount, zero))
|
||||
transmute(simd_select_bitmask(k, zerocount, i32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst.
|
||||
|
|
@ -354,8 +347,7 @@ pub unsafe fn _mm256_mask_lzcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) ->
|
|||
#[cfg_attr(test, assert_instr(vplzcntd))]
|
||||
pub unsafe fn _mm256_maskz_lzcnt_epi32(k: __mmask8, a: __m256i) -> __m256i {
|
||||
let zerocount = _mm256_lzcnt_epi32(a).as_i32x8();
|
||||
let zero = _mm256_setzero_si256().as_i32x8();
|
||||
transmute(simd_select_bitmask(k, zerocount, zero))
|
||||
transmute(simd_select_bitmask(k, zerocount, i32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst.
|
||||
|
|
@ -390,8 +382,7 @@ pub unsafe fn _mm_mask_lzcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m
|
|||
#[cfg_attr(test, assert_instr(vplzcntd))]
|
||||
pub unsafe fn _mm_maskz_lzcnt_epi32(k: __mmask8, a: __m128i) -> __m128i {
|
||||
let zerocount = _mm_lzcnt_epi32(a).as_i32x4();
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
transmute(simd_select_bitmask(k, zerocount, zero))
|
||||
transmute(simd_select_bitmask(k, zerocount, i32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst.
|
||||
|
|
@ -426,8 +417,7 @@ pub unsafe fn _mm512_mask_lzcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) ->
|
|||
#[cfg_attr(test, assert_instr(vplzcntq))]
|
||||
pub unsafe fn _mm512_maskz_lzcnt_epi64(k: __mmask8, a: __m512i) -> __m512i {
|
||||
let zerocount = _mm512_lzcnt_epi64(a).as_i64x8();
|
||||
let zero = _mm512_setzero_si512().as_i64x8();
|
||||
transmute(simd_select_bitmask(k, zerocount, zero))
|
||||
transmute(simd_select_bitmask(k, zerocount, i64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst.
|
||||
|
|
@ -462,8 +452,7 @@ pub unsafe fn _mm256_mask_lzcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) ->
|
|||
#[cfg_attr(test, assert_instr(vplzcntq))]
|
||||
pub unsafe fn _mm256_maskz_lzcnt_epi64(k: __mmask8, a: __m256i) -> __m256i {
|
||||
let zerocount = _mm256_lzcnt_epi64(a).as_i64x4();
|
||||
let zero = _mm256_setzero_si256().as_i64x4();
|
||||
transmute(simd_select_bitmask(k, zerocount, zero))
|
||||
transmute(simd_select_bitmask(k, zerocount, i64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst.
|
||||
|
|
@ -498,8 +487,7 @@ pub unsafe fn _mm_mask_lzcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m
|
|||
#[cfg_attr(test, assert_instr(vplzcntq))]
|
||||
pub unsafe fn _mm_maskz_lzcnt_epi64(k: __mmask8, a: __m128i) -> __m128i {
|
||||
let zerocount = _mm_lzcnt_epi64(a).as_i64x2();
|
||||
let zero = _mm_setzero_si128().as_i64x2();
|
||||
transmute(simd_select_bitmask(k, zerocount, zero))
|
||||
transmute(simd_select_bitmask(k, zerocount, i64x2::ZERO))
|
||||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
|
|
|
|||
|
|
@ -30,8 +30,7 @@ pub unsafe fn _mm_mask_and_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_and_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
|
||||
let and = _mm_and_pd(a, b).as_f64x2();
|
||||
let zero = _mm_setzero_pd().as_f64x2();
|
||||
transmute(simd_select_bitmask(k, and, zero))
|
||||
transmute(simd_select_bitmask(k, and, f64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b
|
||||
|
|
@ -58,8 +57,7 @@ pub unsafe fn _mm256_mask_and_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m25
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_and_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
|
||||
let and = _mm256_and_pd(a, b).as_f64x4();
|
||||
let zero = _mm256_setzero_pd().as_f64x4();
|
||||
transmute(simd_select_bitmask(k, and, zero))
|
||||
transmute(simd_select_bitmask(k, and, f64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b
|
||||
|
|
@ -98,8 +96,7 @@ pub unsafe fn _mm512_mask_and_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m51
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_and_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
|
||||
let and = _mm512_and_pd(a, b).as_f64x8();
|
||||
let zero = _mm512_setzero_pd().as_f64x8();
|
||||
transmute(simd_select_bitmask(k, and, zero))
|
||||
transmute(simd_select_bitmask(k, and, f64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b
|
||||
|
|
@ -126,8 +123,7 @@ pub unsafe fn _mm_mask_and_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_and_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
|
||||
let and = _mm_and_ps(a, b).as_f32x4();
|
||||
let zero = _mm_setzero_ps().as_f32x4();
|
||||
transmute(simd_select_bitmask(k, and, zero))
|
||||
transmute(simd_select_bitmask(k, and, f32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b
|
||||
|
|
@ -154,8 +150,7 @@ pub unsafe fn _mm256_mask_and_ps(src: __m256, k: __mmask8, a: __m256, b: __m256)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_and_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
|
||||
let and = _mm256_and_ps(a, b).as_f32x8();
|
||||
let zero = _mm256_setzero_ps().as_f32x8();
|
||||
transmute(simd_select_bitmask(k, and, zero))
|
||||
transmute(simd_select_bitmask(k, and, f32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b
|
||||
|
|
@ -197,8 +192,7 @@ pub unsafe fn _mm512_mask_and_ps(src: __m512, k: __mmask16, a: __m512, b: __m512
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_and_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
|
||||
let and = _mm512_and_ps(a, b).as_f32x16();
|
||||
let zero = _mm512_setzero_ps().as_f32x16();
|
||||
transmute(simd_select_bitmask(k, and, zero))
|
||||
transmute(simd_select_bitmask(k, and, f32x16::ZERO))
|
||||
}
|
||||
|
||||
// Andnot
|
||||
|
|
@ -228,8 +222,7 @@ pub unsafe fn _mm_mask_andnot_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m12
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_andnot_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
|
||||
let andnot = _mm_andnot_pd(a, b).as_f64x2();
|
||||
let zero = _mm_setzero_pd().as_f64x2();
|
||||
transmute(simd_select_bitmask(k, andnot, zero))
|
||||
transmute(simd_select_bitmask(k, andnot, f64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then
|
||||
|
|
@ -257,8 +250,7 @@ pub unsafe fn _mm256_mask_andnot_pd(src: __m256d, k: __mmask8, a: __m256d, b: __
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_andnot_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
|
||||
let andnot = _mm256_andnot_pd(a, b).as_f64x4();
|
||||
let zero = _mm256_setzero_pd().as_f64x4();
|
||||
transmute(simd_select_bitmask(k, andnot, zero))
|
||||
transmute(simd_select_bitmask(k, andnot, f64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then
|
||||
|
|
@ -298,8 +290,7 @@ pub unsafe fn _mm512_mask_andnot_pd(src: __m512d, k: __mmask8, a: __m512d, b: __
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_andnot_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
|
||||
let andnot = _mm512_andnot_pd(a, b).as_f64x8();
|
||||
let zero = _mm512_setzero_pd().as_f64x8();
|
||||
transmute(simd_select_bitmask(k, andnot, zero))
|
||||
transmute(simd_select_bitmask(k, andnot, f64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then
|
||||
|
|
@ -327,8 +318,7 @@ pub unsafe fn _mm_mask_andnot_ps(src: __m128, k: __mmask8, a: __m128, b: __m128)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_andnot_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
|
||||
let andnot = _mm_andnot_ps(a, b).as_f32x4();
|
||||
let zero = _mm_setzero_ps().as_f32x4();
|
||||
transmute(simd_select_bitmask(k, andnot, zero))
|
||||
transmute(simd_select_bitmask(k, andnot, f32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then
|
||||
|
|
@ -356,8 +346,7 @@ pub unsafe fn _mm256_mask_andnot_ps(src: __m256, k: __mmask8, a: __m256, b: __m2
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_andnot_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
|
||||
let andnot = _mm256_andnot_ps(a, b).as_f32x8();
|
||||
let zero = _mm256_setzero_ps().as_f32x8();
|
||||
transmute(simd_select_bitmask(k, andnot, zero))
|
||||
transmute(simd_select_bitmask(k, andnot, f32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then
|
||||
|
|
@ -397,8 +386,7 @@ pub unsafe fn _mm512_mask_andnot_ps(src: __m512, k: __mmask16, a: __m512, b: __m
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_andnot_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
|
||||
let andnot = _mm512_andnot_ps(a, b).as_f32x16();
|
||||
let zero = _mm512_setzero_ps().as_f32x16();
|
||||
transmute(simd_select_bitmask(k, andnot, zero))
|
||||
transmute(simd_select_bitmask(k, andnot, f32x16::ZERO))
|
||||
}
|
||||
|
||||
// Or
|
||||
|
|
@ -427,8 +415,7 @@ pub unsafe fn _mm_mask_or_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_or_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
|
||||
let or = _mm_or_pd(a, b).as_f64x2();
|
||||
let zero = _mm_setzero_pd().as_f64x2();
|
||||
transmute(simd_select_bitmask(k, or, zero))
|
||||
transmute(simd_select_bitmask(k, or, f64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b
|
||||
|
|
@ -455,8 +442,7 @@ pub unsafe fn _mm256_mask_or_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_or_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
|
||||
let or = _mm256_or_pd(a, b).as_f64x4();
|
||||
let zero = _mm256_setzero_pd().as_f64x4();
|
||||
transmute(simd_select_bitmask(k, or, zero))
|
||||
transmute(simd_select_bitmask(k, or, f64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b
|
||||
|
|
@ -495,8 +481,7 @@ pub unsafe fn _mm512_mask_or_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_or_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
|
||||
let or = _mm512_or_pd(a, b).as_f64x8();
|
||||
let zero = _mm512_setzero_pd().as_f64x8();
|
||||
transmute(simd_select_bitmask(k, or, zero))
|
||||
transmute(simd_select_bitmask(k, or, f64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b
|
||||
|
|
@ -523,8 +508,7 @@ pub unsafe fn _mm_mask_or_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_or_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
|
||||
let or = _mm_or_ps(a, b).as_f32x4();
|
||||
let zero = _mm_setzero_ps().as_f32x4();
|
||||
transmute(simd_select_bitmask(k, or, zero))
|
||||
transmute(simd_select_bitmask(k, or, f32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b
|
||||
|
|
@ -551,8 +535,7 @@ pub unsafe fn _mm256_mask_or_ps(src: __m256, k: __mmask8, a: __m256, b: __m256)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_or_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
|
||||
let or = _mm256_or_ps(a, b).as_f32x8();
|
||||
let zero = _mm256_setzero_ps().as_f32x8();
|
||||
transmute(simd_select_bitmask(k, or, zero))
|
||||
transmute(simd_select_bitmask(k, or, f32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b
|
||||
|
|
@ -594,8 +577,7 @@ pub unsafe fn _mm512_mask_or_ps(src: __m512, k: __mmask16, a: __m512, b: __m512)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_or_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
|
||||
let or = _mm512_or_ps(a, b).as_f32x16();
|
||||
let zero = _mm512_setzero_ps().as_f32x16();
|
||||
transmute(simd_select_bitmask(k, or, zero))
|
||||
transmute(simd_select_bitmask(k, or, f32x16::ZERO))
|
||||
}
|
||||
|
||||
// Xor
|
||||
|
|
@ -624,8 +606,7 @@ pub unsafe fn _mm_mask_xor_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_xor_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
|
||||
let xor = _mm_xor_pd(a, b).as_f64x2();
|
||||
let zero = _mm_setzero_pd().as_f64x2();
|
||||
transmute(simd_select_bitmask(k, xor, zero))
|
||||
transmute(simd_select_bitmask(k, xor, f64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b
|
||||
|
|
@ -652,8 +633,7 @@ pub unsafe fn _mm256_mask_xor_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m25
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_xor_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
|
||||
let xor = _mm256_xor_pd(a, b).as_f64x4();
|
||||
let zero = _mm256_setzero_pd().as_f64x4();
|
||||
transmute(simd_select_bitmask(k, xor, zero))
|
||||
transmute(simd_select_bitmask(k, xor, f64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b
|
||||
|
|
@ -692,8 +672,7 @@ pub unsafe fn _mm512_mask_xor_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m51
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_xor_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
|
||||
let xor = _mm512_xor_pd(a, b).as_f64x8();
|
||||
let zero = _mm512_setzero_pd().as_f64x8();
|
||||
transmute(simd_select_bitmask(k, xor, zero))
|
||||
transmute(simd_select_bitmask(k, xor, f64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b
|
||||
|
|
@ -720,8 +699,7 @@ pub unsafe fn _mm_mask_xor_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_xor_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
|
||||
let xor = _mm_xor_ps(a, b).as_f32x4();
|
||||
let zero = _mm_setzero_ps().as_f32x4();
|
||||
transmute(simd_select_bitmask(k, xor, zero))
|
||||
transmute(simd_select_bitmask(k, xor, f32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b
|
||||
|
|
@ -748,8 +726,7 @@ pub unsafe fn _mm256_mask_xor_ps(src: __m256, k: __mmask8, a: __m256, b: __m256)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_xor_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
|
||||
let xor = _mm256_xor_ps(a, b).as_f32x8();
|
||||
let zero = _mm256_setzero_ps().as_f32x8();
|
||||
transmute(simd_select_bitmask(k, xor, zero))
|
||||
transmute(simd_select_bitmask(k, xor, f32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b
|
||||
|
|
@ -791,8 +768,7 @@ pub unsafe fn _mm512_mask_xor_ps(src: __m512, k: __mmask16, a: __m512, b: __m512
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_xor_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
|
||||
let xor = _mm512_xor_ps(a, b).as_f32x16();
|
||||
let zero = _mm512_setzero_ps().as_f32x16();
|
||||
transmute(simd_select_bitmask(k, xor, zero))
|
||||
transmute(simd_select_bitmask(k, xor, f32x16::ZERO))
|
||||
}
|
||||
|
||||
// Broadcast
|
||||
|
|
@ -832,8 +808,7 @@ pub unsafe fn _mm256_mask_broadcast_f32x2(src: __m256, k: __mmask8, a: __m128) -
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_broadcast_f32x2(k: __mmask8, a: __m128) -> __m256 {
|
||||
let b = _mm256_broadcast_f32x2(a).as_f32x8();
|
||||
let zero = _mm256_setzero_ps().as_f32x8();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all
|
||||
|
|
@ -871,8 +846,7 @@ pub unsafe fn _mm512_mask_broadcast_f32x2(src: __m512, k: __mmask16, a: __m128)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_broadcast_f32x2(k: __mmask16, a: __m128) -> __m512 {
|
||||
let b = _mm512_broadcast_f32x2(a).as_f32x16();
|
||||
let zero = _mm512_setzero_ps().as_f32x16();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all
|
||||
|
|
@ -908,8 +882,7 @@ pub unsafe fn _mm512_mask_broadcast_f32x8(src: __m512, k: __mmask16, a: __m256)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_broadcast_f32x8(k: __mmask16, a: __m256) -> __m512 {
|
||||
let b = _mm512_broadcast_f32x8(a).as_f32x16();
|
||||
let zero = _mm512_setzero_ps().as_f32x16();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all
|
||||
|
|
@ -945,8 +918,7 @@ pub unsafe fn _mm256_mask_broadcast_f64x2(src: __m256d, k: __mmask8, a: __m128d)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m256d {
|
||||
let b = _mm256_broadcast_f64x2(a).as_f64x4();
|
||||
let zero = _mm256_setzero_pd().as_f64x4();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all
|
||||
|
|
@ -982,8 +954,7 @@ pub unsafe fn _mm512_mask_broadcast_f64x2(src: __m512d, k: __mmask8, a: __m128d)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m512d {
|
||||
let b = _mm512_broadcast_f64x2(a).as_f64x8();
|
||||
let zero = _mm512_setzero_pd().as_f64x8();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst.
|
||||
|
|
@ -1021,8 +992,7 @@ pub unsafe fn _mm_mask_broadcast_i32x2(src: __m128i, k: __mmask8, a: __m128i) ->
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m128i {
|
||||
let b = _mm_broadcast_i32x2(a).as_i32x4();
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, i32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst.
|
||||
|
|
@ -1060,8 +1030,7 @@ pub unsafe fn _mm256_mask_broadcast_i32x2(src: __m256i, k: __mmask8, a: __m128i)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m256i {
|
||||
let b = _mm256_broadcast_i32x2(a).as_i32x8();
|
||||
let zero = _mm256_setzero_si256().as_i32x8();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, i32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst.
|
||||
|
|
@ -1099,8 +1068,7 @@ pub unsafe fn _mm512_mask_broadcast_i32x2(src: __m512i, k: __mmask16, a: __m128i
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_broadcast_i32x2(k: __mmask16, a: __m128i) -> __m512i {
|
||||
let b = _mm512_broadcast_i32x2(a).as_i32x16();
|
||||
let zero = _mm512_setzero_si512().as_i32x16();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, i32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Broadcasts the 8 packed 32-bit integers from a to all elements of dst.
|
||||
|
|
@ -1136,8 +1104,7 @@ pub unsafe fn _mm512_mask_broadcast_i32x8(src: __m512i, k: __mmask16, a: __m256i
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_broadcast_i32x8(k: __mmask16, a: __m256i) -> __m512i {
|
||||
let b = _mm512_broadcast_i32x8(a).as_i32x16();
|
||||
let zero = _mm512_setzero_si512().as_i32x16();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, i32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst.
|
||||
|
|
@ -1173,8 +1140,7 @@ pub unsafe fn _mm256_mask_broadcast_i64x2(src: __m256i, k: __mmask8, a: __m128i)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m256i {
|
||||
let b = _mm256_broadcast_i64x2(a).as_i64x4();
|
||||
let zero = _mm256_setzero_si256().as_i64x4();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, i64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Broadcasts the 2 packed 64-bit integers from a to all elements of dst.
|
||||
|
|
@ -1210,8 +1176,7 @@ pub unsafe fn _mm512_mask_broadcast_i64x2(src: __m512i, k: __mmask8, a: __m128i)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m512i {
|
||||
let b = _mm512_broadcast_i64x2(a).as_i64x8();
|
||||
let zero = _mm512_setzero_si512().as_i64x8();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, i64x8::ZERO))
|
||||
}
|
||||
|
||||
// Extract
|
||||
|
|
@ -1265,8 +1230,7 @@ pub unsafe fn _mm512_mask_extractf32x8_ps<const IMM8: i32>(
|
|||
pub unsafe fn _mm512_maskz_extractf32x8_ps<const IMM8: i32>(k: __mmask8, a: __m512) -> __m256 {
|
||||
static_assert_uimm_bits!(IMM8, 1);
|
||||
let b = _mm512_extractf32x8_ps::<IMM8>(a);
|
||||
let zero = _mm256_setzero_ps().as_f32x8();
|
||||
transmute(simd_select_bitmask(k, b.as_f32x8(), zero))
|
||||
transmute(simd_select_bitmask(k, b.as_f32x8(), f32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a,
|
||||
|
|
@ -1318,8 +1282,7 @@ pub unsafe fn _mm256_mask_extractf64x2_pd<const IMM8: i32>(
|
|||
pub unsafe fn _mm256_maskz_extractf64x2_pd<const IMM8: i32>(k: __mmask8, a: __m256d) -> __m128d {
|
||||
static_assert_uimm_bits!(IMM8, 1);
|
||||
let b = _mm256_extractf64x2_pd::<IMM8>(a);
|
||||
let zero = _mm_setzero_pd().as_f64x2();
|
||||
transmute(simd_select_bitmask(k, b.as_f64x2(), zero))
|
||||
transmute(simd_select_bitmask(k, b.as_f64x2(), f64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a,
|
||||
|
|
@ -1373,8 +1336,7 @@ pub unsafe fn _mm512_mask_extractf64x2_pd<const IMM8: i32>(
|
|||
pub unsafe fn _mm512_maskz_extractf64x2_pd<const IMM8: i32>(k: __mmask8, a: __m512d) -> __m128d {
|
||||
static_assert_uimm_bits!(IMM8, 2);
|
||||
let b = _mm512_extractf64x2_pd::<IMM8>(a).as_f64x2();
|
||||
let zero = _mm_setzero_pd().as_f64x2();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores
|
||||
|
|
@ -1426,8 +1388,7 @@ pub unsafe fn _mm512_mask_extracti32x8_epi32<const IMM8: i32>(
|
|||
pub unsafe fn _mm512_maskz_extracti32x8_epi32<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m256i {
|
||||
static_assert_uimm_bits!(IMM8, 1);
|
||||
let b = _mm512_extracti32x8_epi32::<IMM8>(a).as_i32x8();
|
||||
let zero = _mm256_setzero_si256().as_i32x8();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, i32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores
|
||||
|
|
@ -1478,8 +1439,7 @@ pub unsafe fn _mm256_mask_extracti64x2_epi64<const IMM8: i32>(
|
|||
pub unsafe fn _mm256_maskz_extracti64x2_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m128i {
|
||||
static_assert_uimm_bits!(IMM8, 1);
|
||||
let b = _mm256_extracti64x2_epi64::<IMM8>(a).as_i64x2();
|
||||
let zero = _mm_setzero_si128().as_i64x2();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, i64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores
|
||||
|
|
@ -1532,8 +1492,7 @@ pub unsafe fn _mm512_mask_extracti64x2_epi64<const IMM8: i32>(
|
|||
pub unsafe fn _mm512_maskz_extracti64x2_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m128i {
|
||||
static_assert_uimm_bits!(IMM8, 2);
|
||||
let b = _mm512_extracti64x2_epi64::<IMM8>(a).as_i64x2();
|
||||
let zero = _mm_setzero_si128().as_i64x2();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, i64x2::ZERO))
|
||||
}
|
||||
|
||||
// Insert
|
||||
|
|
@ -1601,8 +1560,7 @@ pub unsafe fn _mm512_maskz_insertf32x8<const IMM8: i32>(
|
|||
) -> __m512 {
|
||||
static_assert_uimm_bits!(IMM8, 1);
|
||||
let c = _mm512_insertf32x8::<IMM8>(a, b).as_f32x16();
|
||||
let zero = _mm512_setzero_ps().as_f32x16();
|
||||
transmute(simd_select_bitmask(k, c, zero))
|
||||
transmute(simd_select_bitmask(k, c, f32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point
|
||||
|
|
@ -1660,8 +1618,7 @@ pub unsafe fn _mm256_maskz_insertf64x2<const IMM8: i32>(
|
|||
) -> __m256d {
|
||||
static_assert_uimm_bits!(IMM8, 1);
|
||||
let c = _mm256_insertf64x2::<IMM8>(a, b).as_f64x4();
|
||||
let zero = _mm256_setzero_pd().as_f64x4();
|
||||
transmute(simd_select_bitmask(k, c, zero))
|
||||
transmute(simd_select_bitmask(k, c, f64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point
|
||||
|
|
@ -1721,8 +1678,7 @@ pub unsafe fn _mm512_maskz_insertf64x2<const IMM8: i32>(
|
|||
) -> __m512d {
|
||||
static_assert_uimm_bits!(IMM8, 2);
|
||||
let c = _mm512_insertf64x2::<IMM8>(a, b).as_f64x8();
|
||||
let zero = _mm512_setzero_pd().as_f64x8();
|
||||
transmute(simd_select_bitmask(k, c, zero))
|
||||
transmute(simd_select_bitmask(k, c, f64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Copy a to dst, then insert 256 bits (composed of 8 packed 32-bit integers) from b into dst at the
|
||||
|
|
@ -1790,8 +1746,7 @@ pub unsafe fn _mm512_maskz_inserti32x8<const IMM8: i32>(
|
|||
) -> __m512i {
|
||||
static_assert_uimm_bits!(IMM8, 1);
|
||||
let c = _mm512_inserti32x8::<IMM8>(a, b).as_i32x16();
|
||||
let zero = _mm512_setzero_si512().as_i32x16();
|
||||
transmute(simd_select_bitmask(k, c, zero))
|
||||
transmute(simd_select_bitmask(k, c, i32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the
|
||||
|
|
@ -1850,8 +1805,7 @@ pub unsafe fn _mm256_maskz_inserti64x2<const IMM8: i32>(
|
|||
) -> __m256i {
|
||||
static_assert_uimm_bits!(IMM8, 1);
|
||||
let c = _mm256_inserti64x2::<IMM8>(a, b).as_i64x4();
|
||||
let zero = _mm256_setzero_si256().as_i64x4();
|
||||
transmute(simd_select_bitmask(k, c, zero))
|
||||
transmute(simd_select_bitmask(k, c, i64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the
|
||||
|
|
@ -1912,8 +1866,7 @@ pub unsafe fn _mm512_maskz_inserti64x2<const IMM8: i32>(
|
|||
) -> __m512i {
|
||||
static_assert_uimm_bits!(IMM8, 2);
|
||||
let c = _mm512_inserti64x2::<IMM8>(a, b).as_i64x8();
|
||||
let zero = _mm512_setzero_si512().as_i64x8();
|
||||
transmute(simd_select_bitmask(k, c, zero))
|
||||
transmute(simd_select_bitmask(k, c, i64x8::ZERO))
|
||||
}
|
||||
|
||||
// Convert
|
||||
|
|
@ -1986,8 +1939,7 @@ pub unsafe fn _mm512_maskz_cvt_roundepi64_pd<const ROUNDING: i32>(
|
|||
) -> __m512d {
|
||||
static_assert_rounding!(ROUNDING);
|
||||
let b = _mm512_cvt_roundepi64_pd::<ROUNDING>(a).as_f64x8();
|
||||
let zero = _mm512_setzero_pd().as_f64x8();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
|
||||
|
|
@ -2026,8 +1978,7 @@ pub unsafe fn _mm_mask_cvtepi64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_cvtepi64_pd(k: __mmask8, a: __m128i) -> __m128d {
|
||||
let b = _mm_cvtepi64_pd(a).as_f64x2();
|
||||
let zero = _mm_setzero_pd().as_f64x2();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
|
||||
|
|
@ -2066,8 +2017,7 @@ pub unsafe fn _mm256_mask_cvtepi64_pd(src: __m256d, k: __mmask8, a: __m256i) ->
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_cvtepi64_pd(k: __mmask8, a: __m256i) -> __m256d {
|
||||
let b = _mm256_cvtepi64_pd(a).as_f64x4();
|
||||
let zero = _mm256_setzero_pd().as_f64x4();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
|
||||
|
|
@ -2106,8 +2056,7 @@ pub unsafe fn _mm512_mask_cvtepi64_pd(src: __m512d, k: __mmask8, a: __m512i) ->
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_cvtepi64_pd(k: __mmask8, a: __m512i) -> __m512d {
|
||||
let b = _mm512_cvtepi64_pd(a).as_f64x8();
|
||||
let zero = _mm512_setzero_pd().as_f64x8();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
|
||||
|
|
@ -2178,8 +2127,7 @@ pub unsafe fn _mm512_maskz_cvt_roundepi64_ps<const ROUNDING: i32>(
|
|||
) -> __m256 {
|
||||
static_assert_rounding!(ROUNDING);
|
||||
let b = _mm512_cvt_roundepi64_ps::<ROUNDING>(a).as_f32x8();
|
||||
let zero = _mm256_setzero_ps().as_f32x8();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
|
||||
|
|
@ -2255,8 +2203,7 @@ pub unsafe fn _mm256_mask_cvtepi64_ps(src: __m128, k: __mmask8, a: __m256i) -> _
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_cvtepi64_ps(k: __mmask8, a: __m256i) -> __m128 {
|
||||
let b = _mm256_cvtepi64_ps(a).as_f32x4();
|
||||
let zero = _mm_setzero_ps().as_f32x4();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
|
||||
|
|
@ -2295,8 +2242,7 @@ pub unsafe fn _mm512_mask_cvtepi64_ps(src: __m256, k: __mmask8, a: __m512i) -> _
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_cvtepi64_ps(k: __mmask8, a: __m512i) -> __m256 {
|
||||
let b = _mm512_cvtepi64_ps(a).as_f32x8();
|
||||
let zero = _mm256_setzero_ps().as_f32x8();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
|
||||
|
|
@ -2367,8 +2313,7 @@ pub unsafe fn _mm512_maskz_cvt_roundepu64_pd<const ROUNDING: i32>(
|
|||
) -> __m512d {
|
||||
static_assert_rounding!(ROUNDING);
|
||||
let b = _mm512_cvt_roundepu64_pd::<ROUNDING>(a).as_f64x8();
|
||||
let zero = _mm512_setzero_pd().as_f64x8();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
|
||||
|
|
@ -2407,8 +2352,7 @@ pub unsafe fn _mm_mask_cvtepu64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_cvtepu64_pd(k: __mmask8, a: __m128i) -> __m128d {
|
||||
let b = _mm_cvtepu64_pd(a).as_f64x2();
|
||||
let zero = _mm_setzero_pd().as_f64x2();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
|
||||
|
|
@ -2447,8 +2391,7 @@ pub unsafe fn _mm256_mask_cvtepu64_pd(src: __m256d, k: __mmask8, a: __m256i) ->
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_cvtepu64_pd(k: __mmask8, a: __m256i) -> __m256d {
|
||||
let b = _mm256_cvtepu64_pd(a).as_f64x4();
|
||||
let zero = _mm256_setzero_pd().as_f64x4();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
|
||||
|
|
@ -2487,8 +2430,7 @@ pub unsafe fn _mm512_mask_cvtepu64_pd(src: __m512d, k: __mmask8, a: __m512i) ->
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_cvtepu64_pd(k: __mmask8, a: __m512i) -> __m512d {
|
||||
let b = _mm512_cvtepu64_pd(a).as_f64x8();
|
||||
let zero = _mm512_setzero_pd().as_f64x8();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
|
||||
|
|
@ -2559,8 +2501,7 @@ pub unsafe fn _mm512_maskz_cvt_roundepu64_ps<const ROUNDING: i32>(
|
|||
) -> __m256 {
|
||||
static_assert_rounding!(ROUNDING);
|
||||
let b = _mm512_cvt_roundepu64_ps::<ROUNDING>(a).as_f32x8();
|
||||
let zero = _mm256_setzero_ps().as_f32x8();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
|
||||
|
|
@ -2636,8 +2577,7 @@ pub unsafe fn _mm256_mask_cvtepu64_ps(src: __m128, k: __mmask8, a: __m256i) -> _
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_cvtepu64_ps(k: __mmask8, a: __m256i) -> __m128 {
|
||||
let b = _mm256_cvtepu64_ps(a).as_f32x4();
|
||||
let zero = _mm_setzero_ps().as_f32x4();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
|
||||
|
|
@ -2676,8 +2616,7 @@ pub unsafe fn _mm512_mask_cvtepu64_ps(src: __m256, k: __mmask8, a: __m512i) -> _
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_cvtepu64_ps(k: __mmask8, a: __m512i) -> __m256 {
|
||||
let b = _mm512_cvtepu64_ps(a).as_f32x8();
|
||||
let zero = _mm256_setzero_ps().as_f32x8();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, f32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers,
|
||||
|
|
@ -4131,8 +4070,7 @@ pub unsafe fn _mm_mask_mullo_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_maskz_mullo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
|
||||
let b = _mm_mullo_epi64(a, b).as_i64x2();
|
||||
let zero = _mm_setzero_si128().as_i64x2();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, i64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store
|
||||
|
|
@ -4177,8 +4115,7 @@ pub unsafe fn _mm256_mask_mullo_epi64(
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_maskz_mullo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
|
||||
let b = _mm256_mullo_epi64(a, b).as_i64x4();
|
||||
let zero = _mm256_setzero_si256().as_i64x4();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, i64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store
|
||||
|
|
@ -4223,8 +4160,7 @@ pub unsafe fn _mm512_mask_mullo_epi64(
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm512_maskz_mullo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
|
||||
let b = _mm512_mullo_epi64(a, b).as_i64x8();
|
||||
let zero = _mm512_setzero_si512().as_i64x8();
|
||||
transmute(simd_select_bitmask(k, b, zero))
|
||||
transmute(simd_select_bitmask(k, b, i64x8::ZERO))
|
||||
}
|
||||
|
||||
// Mask Registers
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -239,7 +239,7 @@ pub unsafe fn _mm512_setr_ph(
|
|||
#[target_feature(enable = "avx512fp16,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
|
||||
pub unsafe fn _mm_setzero_ph() -> __m128h {
|
||||
transmute(f16x8::splat(0.0))
|
||||
transmute(f16x8::ZERO)
|
||||
}
|
||||
|
||||
/// Return vector of type __m256h with all elements set to zero.
|
||||
|
|
@ -249,7 +249,7 @@ pub unsafe fn _mm_setzero_ph() -> __m128h {
|
|||
#[target_feature(enable = "avx512fp16,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
|
||||
pub unsafe fn _mm256_setzero_ph() -> __m256h {
|
||||
transmute(f16x16::splat(0.0))
|
||||
transmute(f16x16::ZERO)
|
||||
}
|
||||
|
||||
/// Return vector of type __m512h with all elements set to zero.
|
||||
|
|
@ -259,7 +259,7 @@ pub unsafe fn _mm256_setzero_ph() -> __m256h {
|
|||
#[target_feature(enable = "avx512fp16")]
|
||||
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
|
||||
pub unsafe fn _mm512_setzero_ph() -> __m512h {
|
||||
transmute(f16x32::splat(0.0))
|
||||
transmute(f16x32::ZERO)
|
||||
}
|
||||
|
||||
/// Return vector of type `__m128h` with undefined elements. In practice, this returns the all-zero
|
||||
|
|
@ -270,7 +270,7 @@ pub unsafe fn _mm512_setzero_ph() -> __m512h {
|
|||
#[target_feature(enable = "avx512fp16,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
|
||||
pub unsafe fn _mm_undefined_ph() -> __m128h {
|
||||
transmute(f16x8::splat(0.0))
|
||||
transmute(f16x8::ZERO)
|
||||
}
|
||||
|
||||
/// Return vector of type `__m256h` with undefined elements. In practice, this returns the all-zero
|
||||
|
|
@ -281,7 +281,7 @@ pub unsafe fn _mm_undefined_ph() -> __m128h {
|
|||
#[target_feature(enable = "avx512fp16,avx512vl")]
|
||||
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
|
||||
pub unsafe fn _mm256_undefined_ph() -> __m256h {
|
||||
transmute(f16x16::splat(0.0))
|
||||
transmute(f16x16::ZERO)
|
||||
}
|
||||
|
||||
/// Return vector of type `__m512h` with undefined elements. In practice, this returns the all-zero
|
||||
|
|
@ -292,7 +292,7 @@ pub unsafe fn _mm256_undefined_ph() -> __m256h {
|
|||
#[target_feature(enable = "avx512fp16")]
|
||||
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
|
||||
pub unsafe fn _mm512_undefined_ph() -> __m512h {
|
||||
transmute(f16x32::splat(0.0))
|
||||
transmute(f16x32::ZERO)
|
||||
}
|
||||
|
||||
/// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and
|
||||
|
|
@ -15986,7 +15986,7 @@ pub unsafe fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
|
|||
#[target_feature(enable = "avx512fp16")]
|
||||
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
|
||||
pub unsafe fn _mm_cvtsi16_si128(a: i16) -> __m128i {
|
||||
transmute(simd_insert!(i16x8::splat(0), 0, a))
|
||||
transmute(simd_insert!(i16x8::ZERO, 0, a))
|
||||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
|
|
|
|||
|
|
@ -46,8 +46,7 @@ pub unsafe fn _mm512_maskz_permutex2var_epi8(
|
|||
b: __m512i,
|
||||
) -> __m512i {
|
||||
let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64();
|
||||
let zero = _mm512_setzero_si512().as_i8x64();
|
||||
transmute(simd_select_bitmask(k, permute, zero))
|
||||
transmute(simd_select_bitmask(k, permute, i8x64::ZERO))
|
||||
}
|
||||
|
||||
/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
|
||||
|
|
@ -109,8 +108,7 @@ pub unsafe fn _mm256_maskz_permutex2var_epi8(
|
|||
b: __m256i,
|
||||
) -> __m256i {
|
||||
let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32();
|
||||
let zero = _mm256_setzero_si256().as_i8x32();
|
||||
transmute(simd_select_bitmask(k, permute, zero))
|
||||
transmute(simd_select_bitmask(k, permute, i8x32::ZERO))
|
||||
}
|
||||
|
||||
/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
|
||||
|
|
@ -172,8 +170,7 @@ pub unsafe fn _mm_maskz_permutex2var_epi8(
|
|||
b: __m128i,
|
||||
) -> __m128i {
|
||||
let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16();
|
||||
let zero = _mm_setzero_si128().as_i8x16();
|
||||
transmute(simd_select_bitmask(k, permute, zero))
|
||||
transmute(simd_select_bitmask(k, permute, i8x16::ZERO))
|
||||
}
|
||||
|
||||
/// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
|
||||
|
|
@ -230,8 +227,7 @@ pub unsafe fn _mm512_mask_permutexvar_epi8(
|
|||
#[cfg_attr(test, assert_instr(vpermb))]
|
||||
pub unsafe fn _mm512_maskz_permutexvar_epi8(k: __mmask64, idx: __m512i, a: __m512i) -> __m512i {
|
||||
let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64();
|
||||
let zero = _mm512_setzero_si512().as_i8x64();
|
||||
transmute(simd_select_bitmask(k, permute, zero))
|
||||
transmute(simd_select_bitmask(k, permute, i8x64::ZERO))
|
||||
}
|
||||
|
||||
/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
|
||||
|
|
@ -271,8 +267,7 @@ pub unsafe fn _mm256_mask_permutexvar_epi8(
|
|||
#[cfg_attr(test, assert_instr(vpermb))]
|
||||
pub unsafe fn _mm256_maskz_permutexvar_epi8(k: __mmask32, idx: __m256i, a: __m256i) -> __m256i {
|
||||
let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32();
|
||||
let zero = _mm256_setzero_si256().as_i8x32();
|
||||
transmute(simd_select_bitmask(k, permute, zero))
|
||||
transmute(simd_select_bitmask(k, permute, i8x32::ZERO))
|
||||
}
|
||||
|
||||
/// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
|
||||
|
|
@ -312,8 +307,7 @@ pub unsafe fn _mm_mask_permutexvar_epi8(
|
|||
#[cfg_attr(test, assert_instr(vpermb))]
|
||||
pub unsafe fn _mm_maskz_permutexvar_epi8(k: __mmask16, idx: __m128i, a: __m128i) -> __m128i {
|
||||
let permute = _mm_permutexvar_epi8(idx, a).as_i8x16();
|
||||
let zero = _mm_setzero_si128().as_i8x16();
|
||||
transmute(simd_select_bitmask(k, permute, zero))
|
||||
transmute(simd_select_bitmask(k, permute, i8x16::ZERO))
|
||||
}
|
||||
|
||||
/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
|
||||
|
|
@ -353,8 +347,7 @@ pub unsafe fn _mm512_mask_multishift_epi64_epi8(
|
|||
#[cfg_attr(test, assert_instr(vpmultishiftqb))]
|
||||
pub unsafe fn _mm512_maskz_multishift_epi64_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
|
||||
let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64();
|
||||
let zero = _mm512_setzero_si512().as_i8x64();
|
||||
transmute(simd_select_bitmask(k, multishift, zero))
|
||||
transmute(simd_select_bitmask(k, multishift, i8x64::ZERO))
|
||||
}
|
||||
|
||||
/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
|
||||
|
|
@ -394,8 +387,7 @@ pub unsafe fn _mm256_mask_multishift_epi64_epi8(
|
|||
#[cfg_attr(test, assert_instr(vpmultishiftqb))]
|
||||
pub unsafe fn _mm256_maskz_multishift_epi64_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
|
||||
let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32();
|
||||
let zero = _mm256_setzero_si256().as_i8x32();
|
||||
transmute(simd_select_bitmask(k, multishift, zero))
|
||||
transmute(simd_select_bitmask(k, multishift, i8x32::ZERO))
|
||||
}
|
||||
|
||||
/// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
|
||||
|
|
@ -435,8 +427,7 @@ pub unsafe fn _mm_mask_multishift_epi64_epi8(
|
|||
#[cfg_attr(test, assert_instr(vpmultishiftqb))]
|
||||
pub unsafe fn _mm_maskz_multishift_epi64_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
|
||||
let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16();
|
||||
let zero = _mm_setzero_si128().as_i8x16();
|
||||
transmute(simd_select_bitmask(k, multishift, zero))
|
||||
transmute(simd_select_bitmask(k, multishift, i8x16::ZERO))
|
||||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
|
|
|
|||
|
|
@ -247,11 +247,7 @@ pub unsafe fn _mm512_mask_compress_epi16(src: __m512i, k: __mmask32, a: __m512i)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpcompressw))]
|
||||
pub unsafe fn _mm512_maskz_compress_epi16(k: __mmask32, a: __m512i) -> __m512i {
|
||||
transmute(vpcompressw(
|
||||
a.as_i16x32(),
|
||||
_mm512_setzero_si512().as_i16x32(),
|
||||
k,
|
||||
))
|
||||
transmute(vpcompressw(a.as_i16x32(), i16x32::ZERO, k))
|
||||
}
|
||||
|
||||
/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
|
||||
|
|
@ -273,11 +269,7 @@ pub unsafe fn _mm256_mask_compress_epi16(src: __m256i, k: __mmask16, a: __m256i)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpcompressw))]
|
||||
pub unsafe fn _mm256_maskz_compress_epi16(k: __mmask16, a: __m256i) -> __m256i {
|
||||
transmute(vpcompressw256(
|
||||
a.as_i16x16(),
|
||||
_mm256_setzero_si256().as_i16x16(),
|
||||
k,
|
||||
))
|
||||
transmute(vpcompressw256(a.as_i16x16(), i16x16::ZERO, k))
|
||||
}
|
||||
|
||||
/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
|
||||
|
|
@ -299,11 +291,7 @@ pub unsafe fn _mm_mask_compress_epi16(src: __m128i, k: __mmask8, a: __m128i) ->
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpcompressw))]
|
||||
pub unsafe fn _mm_maskz_compress_epi16(k: __mmask8, a: __m128i) -> __m128i {
|
||||
transmute(vpcompressw128(
|
||||
a.as_i16x8(),
|
||||
_mm_setzero_si128().as_i16x8(),
|
||||
k,
|
||||
))
|
||||
transmute(vpcompressw128(a.as_i16x8(), i16x8::ZERO, k))
|
||||
}
|
||||
|
||||
/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
|
||||
|
|
@ -325,11 +313,7 @@ pub unsafe fn _mm512_mask_compress_epi8(src: __m512i, k: __mmask64, a: __m512i)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpcompressb))]
|
||||
pub unsafe fn _mm512_maskz_compress_epi8(k: __mmask64, a: __m512i) -> __m512i {
|
||||
transmute(vpcompressb(
|
||||
a.as_i8x64(),
|
||||
_mm512_setzero_si512().as_i8x64(),
|
||||
k,
|
||||
))
|
||||
transmute(vpcompressb(a.as_i8x64(), i8x64::ZERO, k))
|
||||
}
|
||||
|
||||
/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
|
||||
|
|
@ -351,11 +335,7 @@ pub unsafe fn _mm256_mask_compress_epi8(src: __m256i, k: __mmask32, a: __m256i)
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpcompressb))]
|
||||
pub unsafe fn _mm256_maskz_compress_epi8(k: __mmask32, a: __m256i) -> __m256i {
|
||||
transmute(vpcompressb256(
|
||||
a.as_i8x32(),
|
||||
_mm256_setzero_si256().as_i8x32(),
|
||||
k,
|
||||
))
|
||||
transmute(vpcompressb256(a.as_i8x32(), i8x32::ZERO, k))
|
||||
}
|
||||
|
||||
/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
|
||||
|
|
@ -377,11 +357,7 @@ pub unsafe fn _mm_mask_compress_epi8(src: __m128i, k: __mmask16, a: __m128i) ->
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpcompressb))]
|
||||
pub unsafe fn _mm_maskz_compress_epi8(k: __mmask16, a: __m128i) -> __m128i {
|
||||
transmute(vpcompressb128(
|
||||
a.as_i8x16(),
|
||||
_mm_setzero_si128().as_i8x16(),
|
||||
k,
|
||||
))
|
||||
transmute(vpcompressb128(a.as_i8x16(), i8x16::ZERO, k))
|
||||
}
|
||||
|
||||
/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
|
|
@ -403,11 +379,7 @@ pub unsafe fn _mm512_mask_expand_epi16(src: __m512i, k: __mmask32, a: __m512i) -
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandw))]
|
||||
pub unsafe fn _mm512_maskz_expand_epi16(k: __mmask32, a: __m512i) -> __m512i {
|
||||
transmute(vpexpandw(
|
||||
a.as_i16x32(),
|
||||
_mm512_setzero_si512().as_i16x32(),
|
||||
k,
|
||||
))
|
||||
transmute(vpexpandw(a.as_i16x32(), i16x32::ZERO, k))
|
||||
}
|
||||
|
||||
/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
|
|
@ -429,11 +401,7 @@ pub unsafe fn _mm256_mask_expand_epi16(src: __m256i, k: __mmask16, a: __m256i) -
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandw))]
|
||||
pub unsafe fn _mm256_maskz_expand_epi16(k: __mmask16, a: __m256i) -> __m256i {
|
||||
transmute(vpexpandw256(
|
||||
a.as_i16x16(),
|
||||
_mm256_setzero_si256().as_i16x16(),
|
||||
k,
|
||||
))
|
||||
transmute(vpexpandw256(a.as_i16x16(), i16x16::ZERO, k))
|
||||
}
|
||||
|
||||
/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
|
|
@ -455,11 +423,7 @@ pub unsafe fn _mm_mask_expand_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandw))]
|
||||
pub unsafe fn _mm_maskz_expand_epi16(k: __mmask8, a: __m128i) -> __m128i {
|
||||
transmute(vpexpandw128(
|
||||
a.as_i16x8(),
|
||||
_mm_setzero_si128().as_i16x8(),
|
||||
k,
|
||||
))
|
||||
transmute(vpexpandw128(a.as_i16x8(), i16x8::ZERO, k))
|
||||
}
|
||||
|
||||
/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
|
|
@ -481,11 +445,7 @@ pub unsafe fn _mm512_mask_expand_epi8(src: __m512i, k: __mmask64, a: __m512i) ->
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandb))]
|
||||
pub unsafe fn _mm512_maskz_expand_epi8(k: __mmask64, a: __m512i) -> __m512i {
|
||||
transmute(vpexpandb(
|
||||
a.as_i8x64(),
|
||||
_mm512_setzero_si512().as_i8x64(),
|
||||
k,
|
||||
))
|
||||
transmute(vpexpandb(a.as_i8x64(), i8x64::ZERO, k))
|
||||
}
|
||||
|
||||
/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
|
|
@ -507,11 +467,7 @@ pub unsafe fn _mm256_mask_expand_epi8(src: __m256i, k: __mmask32, a: __m256i) ->
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandb))]
|
||||
pub unsafe fn _mm256_maskz_expand_epi8(k: __mmask32, a: __m256i) -> __m256i {
|
||||
transmute(vpexpandb256(
|
||||
a.as_i8x32(),
|
||||
_mm256_setzero_si256().as_i8x32(),
|
||||
k,
|
||||
))
|
||||
transmute(vpexpandb256(a.as_i8x32(), i8x32::ZERO, k))
|
||||
}
|
||||
|
||||
/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
|
||||
|
|
@ -533,11 +489,7 @@ pub unsafe fn _mm_mask_expand_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpexpandb))]
|
||||
pub unsafe fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i {
|
||||
transmute(vpexpandb128(
|
||||
a.as_i8x16(),
|
||||
_mm_setzero_si128().as_i8x16(),
|
||||
k,
|
||||
))
|
||||
transmute(vpexpandb128(a.as_i8x16(), i8x16::ZERO, k))
|
||||
}
|
||||
|
||||
/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
|
||||
|
|
@ -572,8 +524,7 @@ pub unsafe fn _mm512_mask_shldv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __
|
|||
#[cfg_attr(test, assert_instr(vpshldvq))]
|
||||
pub unsafe fn _mm512_maskz_shldv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
|
||||
let shf = _mm512_shldv_epi64(a, b, c).as_i64x8();
|
||||
let zero = _mm512_setzero_si512().as_i64x8();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
|
||||
|
|
@ -608,8 +559,7 @@ pub unsafe fn _mm256_mask_shldv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __
|
|||
#[cfg_attr(test, assert_instr(vpshldvq))]
|
||||
pub unsafe fn _mm256_maskz_shldv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
|
||||
let shf = _mm256_shldv_epi64(a, b, c).as_i64x4();
|
||||
let zero = _mm256_setzero_si256().as_i64x4();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
|
||||
|
|
@ -644,8 +594,7 @@ pub unsafe fn _mm_mask_shldv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m12
|
|||
#[cfg_attr(test, assert_instr(vpshldvq))]
|
||||
pub unsafe fn _mm_maskz_shldv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
|
||||
let shf = _mm_shldv_epi64(a, b, c).as_i64x2();
|
||||
let zero = _mm_setzero_si128().as_i64x2();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
|
||||
|
|
@ -685,8 +634,7 @@ pub unsafe fn _mm512_maskz_shldv_epi32(
|
|||
c: __m512i,
|
||||
) -> __m512i {
|
||||
let shf = _mm512_shldv_epi32(a, b, c).as_i32x16();
|
||||
let zero = _mm512_setzero_si512().as_i32x16();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
|
||||
|
|
@ -721,8 +669,7 @@ pub unsafe fn _mm256_mask_shldv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __
|
|||
#[cfg_attr(test, assert_instr(vpshldvd))]
|
||||
pub unsafe fn _mm256_maskz_shldv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
|
||||
let shf = _mm256_shldv_epi32(a, b, c).as_i32x8();
|
||||
let zero = _mm256_setzero_si256().as_i32x8();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
|
||||
|
|
@ -757,8 +704,7 @@ pub unsafe fn _mm_mask_shldv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m12
|
|||
#[cfg_attr(test, assert_instr(vpshldvd))]
|
||||
pub unsafe fn _mm_maskz_shldv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
|
||||
let shf = _mm_shldv_epi32(a, b, c).as_i32x4();
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
|
||||
|
|
@ -798,8 +744,7 @@ pub unsafe fn _mm512_maskz_shldv_epi16(
|
|||
c: __m512i,
|
||||
) -> __m512i {
|
||||
let shf = _mm512_shldv_epi16(a, b, c).as_i16x32();
|
||||
let zero = _mm512_setzero_si512().as_i16x32();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
|
||||
|
|
@ -839,8 +784,7 @@ pub unsafe fn _mm256_maskz_shldv_epi16(
|
|||
c: __m256i,
|
||||
) -> __m256i {
|
||||
let shf = _mm256_shldv_epi16(a, b, c).as_i16x16();
|
||||
let zero = _mm256_setzero_si256().as_i16x16();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
|
||||
|
|
@ -875,8 +819,7 @@ pub unsafe fn _mm_mask_shldv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m12
|
|||
#[cfg_attr(test, assert_instr(vpshldvw))]
|
||||
pub unsafe fn _mm_maskz_shldv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
|
||||
let shf = _mm_shldv_epi16(a, b, c).as_i16x8();
|
||||
let zero = _mm_setzero_si128().as_i16x8();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
|
||||
|
|
@ -911,8 +854,7 @@ pub unsafe fn _mm512_mask_shrdv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __
|
|||
#[cfg_attr(test, assert_instr(vpshrdvq))]
|
||||
pub unsafe fn _mm512_maskz_shrdv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
|
||||
let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8();
|
||||
let zero = _mm512_setzero_si512().as_i64x8();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
|
||||
|
|
@ -947,8 +889,7 @@ pub unsafe fn _mm256_mask_shrdv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __
|
|||
#[cfg_attr(test, assert_instr(vpshrdvq))]
|
||||
pub unsafe fn _mm256_maskz_shrdv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
|
||||
let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4();
|
||||
let zero = _mm256_setzero_si256().as_i64x4();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
|
||||
|
|
@ -983,8 +924,7 @@ pub unsafe fn _mm_mask_shrdv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m12
|
|||
#[cfg_attr(test, assert_instr(vpshrdvq))]
|
||||
pub unsafe fn _mm_maskz_shrdv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
|
||||
let shf = _mm_shrdv_epi64(a, b, c).as_i64x2();
|
||||
let zero = _mm_setzero_si128().as_i64x2();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
|
||||
|
|
@ -1024,8 +964,7 @@ pub unsafe fn _mm512_maskz_shrdv_epi32(
|
|||
c: __m512i,
|
||||
) -> __m512i {
|
||||
let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16();
|
||||
let zero = _mm512_setzero_si512().as_i32x16();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
|
||||
|
|
@ -1060,8 +999,7 @@ pub unsafe fn _mm256_mask_shrdv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __
|
|||
#[cfg_attr(test, assert_instr(vpshrdvd))]
|
||||
pub unsafe fn _mm256_maskz_shrdv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
|
||||
let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8();
|
||||
let zero = _mm256_setzero_si256().as_i32x8();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
|
||||
|
|
@ -1096,8 +1034,7 @@ pub unsafe fn _mm_mask_shrdv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m12
|
|||
#[cfg_attr(test, assert_instr(vpshrdvd))]
|
||||
pub unsafe fn _mm_maskz_shrdv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
|
||||
let shf = _mm_shrdv_epi32(a, b, c).as_i32x4();
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
|
||||
|
|
@ -1137,8 +1074,7 @@ pub unsafe fn _mm512_maskz_shrdv_epi16(
|
|||
c: __m512i,
|
||||
) -> __m512i {
|
||||
let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32();
|
||||
let zero = _mm512_setzero_si512().as_i16x32();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
|
||||
|
|
@ -1178,8 +1114,7 @@ pub unsafe fn _mm256_maskz_shrdv_epi16(
|
|||
c: __m256i,
|
||||
) -> __m256i {
|
||||
let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16();
|
||||
let zero = _mm256_setzero_si256().as_i16x16();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
|
||||
|
|
@ -1214,8 +1149,7 @@ pub unsafe fn _mm_mask_shrdv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m12
|
|||
#[cfg_attr(test, assert_instr(vpshrdvw))]
|
||||
pub unsafe fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
|
||||
let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
|
||||
let zero = _mm_setzero_si128().as_i16x8();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
|
||||
|
|
@ -1265,8 +1199,7 @@ pub unsafe fn _mm512_maskz_shldi_epi64<const IMM8: i32>(
|
|||
) -> __m512i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm512_shldi_epi64::<IMM8>(a, b).as_i64x8();
|
||||
let zero = _mm512_setzero_si512().as_i64x8();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
|
||||
|
|
@ -1316,8 +1249,7 @@ pub unsafe fn _mm256_maskz_shldi_epi64<const IMM8: i32>(
|
|||
) -> __m256i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm256_shldi_epi64::<IMM8>(a, b).as_i64x4();
|
||||
let zero = _mm256_setzero_si256().as_i64x4();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
|
||||
|
|
@ -1367,8 +1299,7 @@ pub unsafe fn _mm_maskz_shldi_epi64<const IMM8: i32>(
|
|||
) -> __m128i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm_shldi_epi64::<IMM8>(a, b).as_i64x2();
|
||||
let zero = _mm_setzero_si128().as_i64x2();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
|
||||
|
|
@ -1418,8 +1349,7 @@ pub unsafe fn _mm512_maskz_shldi_epi32<const IMM8: i32>(
|
|||
) -> __m512i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm512_shldi_epi32::<IMM8>(a, b).as_i32x16();
|
||||
let zero = _mm512_setzero_si512().as_i32x16();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
|
||||
|
|
@ -1469,8 +1399,7 @@ pub unsafe fn _mm256_maskz_shldi_epi32<const IMM8: i32>(
|
|||
) -> __m256i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm256_shldi_epi32::<IMM8>(a, b).as_i32x8();
|
||||
let zero = _mm256_setzero_si256().as_i32x8();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
|
||||
|
|
@ -1520,8 +1449,7 @@ pub unsafe fn _mm_maskz_shldi_epi32<const IMM8: i32>(
|
|||
) -> __m128i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm_shldi_epi32::<IMM8>(a, b).as_i32x4();
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
|
||||
|
|
@ -1571,8 +1499,7 @@ pub unsafe fn _mm512_maskz_shldi_epi16<const IMM8: i32>(
|
|||
) -> __m512i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm512_shldi_epi16::<IMM8>(a, b).as_i16x32();
|
||||
let zero = _mm512_setzero_si512().as_i16x32();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
|
||||
|
|
@ -1622,8 +1549,7 @@ pub unsafe fn _mm256_maskz_shldi_epi16<const IMM8: i32>(
|
|||
) -> __m256i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm256_shldi_epi16::<IMM8>(a, b).as_i16x16();
|
||||
let zero = _mm256_setzero_si256().as_i16x16();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
|
||||
|
|
@ -1673,8 +1599,7 @@ pub unsafe fn _mm_maskz_shldi_epi16<const IMM8: i32>(
|
|||
) -> __m128i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm_shldi_epi16::<IMM8>(a, b).as_i16x8();
|
||||
let zero = _mm_setzero_si128().as_i16x8();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
|
||||
|
|
@ -1724,8 +1649,7 @@ pub unsafe fn _mm512_maskz_shrdi_epi64<const IMM8: i32>(
|
|||
) -> __m512i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm512_shrdi_epi64::<IMM8>(a, b).as_i64x8();
|
||||
let zero = _mm512_setzero_si512().as_i64x8();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
|
||||
|
|
@ -1775,8 +1699,7 @@ pub unsafe fn _mm256_maskz_shrdi_epi64<const IMM8: i32>(
|
|||
) -> __m256i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm256_shrdi_epi64::<IMM8>(a, b).as_i64x4();
|
||||
let zero = _mm256_setzero_si256().as_i64x4();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
|
||||
|
|
@ -1826,8 +1749,7 @@ pub unsafe fn _mm_maskz_shrdi_epi64<const IMM8: i32>(
|
|||
) -> __m128i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm_shrdi_epi64::<IMM8>(a, b).as_i64x2();
|
||||
let zero = _mm_setzero_si128().as_i64x2();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
|
||||
|
|
@ -1877,8 +1799,7 @@ pub unsafe fn _mm512_maskz_shrdi_epi32<const IMM8: i32>(
|
|||
) -> __m512i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm512_shrdi_epi32::<IMM8>(a, b).as_i32x16();
|
||||
let zero = _mm512_setzero_si512().as_i32x16();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
|
||||
|
|
@ -1928,8 +1849,7 @@ pub unsafe fn _mm256_maskz_shrdi_epi32<const IMM8: i32>(
|
|||
) -> __m256i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm256_shrdi_epi32::<IMM8>(a, b).as_i32x8();
|
||||
let zero = _mm256_setzero_si256().as_i32x8();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
|
||||
|
|
@ -1979,8 +1899,7 @@ pub unsafe fn _mm_maskz_shrdi_epi32<const IMM8: i32>(
|
|||
) -> __m128i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm_shrdi_epi32::<IMM8>(a, b).as_i32x4();
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
|
||||
|
|
@ -2030,8 +1949,7 @@ pub unsafe fn _mm512_maskz_shrdi_epi16<const IMM8: i32>(
|
|||
) -> __m512i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm512_shrdi_epi16::<IMM8>(a, b).as_i16x32();
|
||||
let zero = _mm512_setzero_si512().as_i16x32();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
|
||||
|
|
@ -2081,8 +1999,7 @@ pub unsafe fn _mm256_maskz_shrdi_epi16<const IMM8: i32>(
|
|||
) -> __m256i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm256_shrdi_epi16::<IMM8>(a, b).as_i16x16();
|
||||
let zero = _mm256_setzero_si256().as_i16x16();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
|
||||
}
|
||||
|
||||
/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
|
||||
|
|
@ -2132,8 +2049,7 @@ pub unsafe fn _mm_maskz_shrdi_epi16<const IMM8: i32>(
|
|||
) -> __m128i {
|
||||
static_assert_uimm_bits!(IMM8, 8);
|
||||
let shf = _mm_shrdi_epi16::<IMM8>(a, b).as_i16x8();
|
||||
let zero = _mm_setzero_si128().as_i16x8();
|
||||
transmute(simd_select_bitmask(k, shf, zero))
|
||||
transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
|
||||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
|
|
|
|||
|
|
@ -46,8 +46,7 @@ pub unsafe fn _mm512_maskz_dpwssd_epi32(
|
|||
b: __m512i,
|
||||
) -> __m512i {
|
||||
let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16();
|
||||
let zero = _mm512_setzero_si512().as_i32x16();
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
transmute(simd_select_bitmask(k, r, i32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
|
|
@ -106,8 +105,7 @@ pub unsafe fn _mm256_maskz_dpwssd_epi32(
|
|||
b: __m256i,
|
||||
) -> __m256i {
|
||||
let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8();
|
||||
let zero = _mm256_setzero_si256().as_i32x8();
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
transmute(simd_select_bitmask(k, r, i32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
|
|
@ -156,8 +154,7 @@ pub unsafe fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __
|
|||
#[cfg_attr(test, assert_instr(vpdpwssd))]
|
||||
pub unsafe fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
let r = _mm_dpwssd_epi32(src, a, b).as_i32x4();
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
transmute(simd_select_bitmask(k, r, i32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
|
||||
|
|
@ -202,8 +199,7 @@ pub unsafe fn _mm512_maskz_dpwssds_epi32(
|
|||
b: __m512i,
|
||||
) -> __m512i {
|
||||
let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16();
|
||||
let zero = _mm512_setzero_si512().as_i32x16();
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
transmute(simd_select_bitmask(k, r, i32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
|
||||
|
|
@ -262,8 +258,7 @@ pub unsafe fn _mm256_maskz_dpwssds_epi32(
|
|||
b: __m256i,
|
||||
) -> __m256i {
|
||||
let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8();
|
||||
let zero = _mm256_setzero_si256().as_i32x8();
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
transmute(simd_select_bitmask(k, r, i32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
|
||||
|
|
@ -317,8 +312,7 @@ pub unsafe fn _mm_maskz_dpwssds_epi32(
|
|||
b: __m128i,
|
||||
) -> __m128i {
|
||||
let r = _mm_dpwssds_epi32(src, a, b).as_i32x4();
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
transmute(simd_select_bitmask(k, r, i32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
|
|
@ -363,8 +357,7 @@ pub unsafe fn _mm512_maskz_dpbusd_epi32(
|
|||
b: __m512i,
|
||||
) -> __m512i {
|
||||
let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16();
|
||||
let zero = _mm512_setzero_si512().as_i32x16();
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
transmute(simd_select_bitmask(k, r, i32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
|
|
@ -423,8 +416,7 @@ pub unsafe fn _mm256_maskz_dpbusd_epi32(
|
|||
b: __m256i,
|
||||
) -> __m256i {
|
||||
let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8();
|
||||
let zero = _mm256_setzero_si256().as_i32x8();
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
transmute(simd_select_bitmask(k, r, i32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
|
|
@ -473,8 +465,7 @@ pub unsafe fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __
|
|||
#[cfg_attr(test, assert_instr(vpdpbusd))]
|
||||
pub unsafe fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
let r = _mm_dpbusd_epi32(src, a, b).as_i32x4();
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
transmute(simd_select_bitmask(k, r, i32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
|
||||
|
|
@ -519,8 +510,7 @@ pub unsafe fn _mm512_maskz_dpbusds_epi32(
|
|||
b: __m512i,
|
||||
) -> __m512i {
|
||||
let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16();
|
||||
let zero = _mm512_setzero_si512().as_i32x16();
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
transmute(simd_select_bitmask(k, r, i32x16::ZERO))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
|
||||
|
|
@ -579,8 +569,7 @@ pub unsafe fn _mm256_maskz_dpbusds_epi32(
|
|||
b: __m256i,
|
||||
) -> __m256i {
|
||||
let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8();
|
||||
let zero = _mm256_setzero_si256().as_i32x8();
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
transmute(simd_select_bitmask(k, r, i32x8::ZERO))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
|
||||
|
|
@ -634,8 +623,7 @@ pub unsafe fn _mm_maskz_dpbusds_epi32(
|
|||
b: __m128i,
|
||||
) -> __m128i {
|
||||
let r = _mm_dpbusds_epi32(src, a, b).as_i32x4();
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
transmute(simd_select_bitmask(k, r, i32x4::ZERO))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
|
||||
|
|
|
|||
|
|
@ -7,14 +7,12 @@
|
|||
//!
|
||||
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
|
||||
|
||||
use crate::core_arch::simd::*;
|
||||
use crate::core_arch::x86::__m128i;
|
||||
use crate::core_arch::x86::__m256i;
|
||||
use crate::core_arch::x86::__m512i;
|
||||
use crate::core_arch::x86::__mmask16;
|
||||
use crate::core_arch::x86::__mmask8;
|
||||
use crate::core_arch::x86::_mm256_setzero_si256;
|
||||
use crate::core_arch::x86::_mm512_setzero_si512;
|
||||
use crate::core_arch::x86::_mm_setzero_si128;
|
||||
use crate::core_arch::x86::m128iExt;
|
||||
use crate::core_arch::x86::m256iExt;
|
||||
use crate::core_arch::x86::m512iExt;
|
||||
|
|
@ -46,8 +44,11 @@ pub unsafe fn _mm512_popcnt_epi32(a: __m512i) -> __m512i {
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpopcntd))]
|
||||
pub unsafe fn _mm512_maskz_popcnt_epi32(k: __mmask16, a: __m512i) -> __m512i {
|
||||
let zero = _mm512_setzero_si512().as_i32x16();
|
||||
transmute(simd_select_bitmask(k, simd_ctpop(a.as_i32x16()), zero))
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
simd_ctpop(a.as_i32x16()),
|
||||
i32x16::ZERO,
|
||||
))
|
||||
}
|
||||
|
||||
/// For each packed 32-bit integer maps the value to the number of logical 1 bits.
|
||||
|
|
@ -90,8 +91,11 @@ pub unsafe fn _mm256_popcnt_epi32(a: __m256i) -> __m256i {
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpopcntd))]
|
||||
pub unsafe fn _mm256_maskz_popcnt_epi32(k: __mmask8, a: __m256i) -> __m256i {
|
||||
let zero = _mm256_setzero_si256().as_i32x8();
|
||||
transmute(simd_select_bitmask(k, simd_ctpop(a.as_i32x8()), zero))
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
simd_ctpop(a.as_i32x8()),
|
||||
i32x8::ZERO,
|
||||
))
|
||||
}
|
||||
|
||||
/// For each packed 32-bit integer maps the value to the number of logical 1 bits.
|
||||
|
|
@ -134,8 +138,11 @@ pub unsafe fn _mm_popcnt_epi32(a: __m128i) -> __m128i {
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpopcntd))]
|
||||
pub unsafe fn _mm_maskz_popcnt_epi32(k: __mmask8, a: __m128i) -> __m128i {
|
||||
let zero = _mm_setzero_si128().as_i32x4();
|
||||
transmute(simd_select_bitmask(k, simd_ctpop(a.as_i32x4()), zero))
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
simd_ctpop(a.as_i32x4()),
|
||||
i32x4::ZERO,
|
||||
))
|
||||
}
|
||||
|
||||
/// For each packed 32-bit integer maps the value to the number of logical 1 bits.
|
||||
|
|
@ -178,8 +185,11 @@ pub unsafe fn _mm512_popcnt_epi64(a: __m512i) -> __m512i {
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpopcntq))]
|
||||
pub unsafe fn _mm512_maskz_popcnt_epi64(k: __mmask8, a: __m512i) -> __m512i {
|
||||
let zero = _mm512_setzero_si512().as_i64x8();
|
||||
transmute(simd_select_bitmask(k, simd_ctpop(a.as_i64x8()), zero))
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
simd_ctpop(a.as_i64x8()),
|
||||
i64x8::ZERO,
|
||||
))
|
||||
}
|
||||
|
||||
/// For each packed 64-bit integer maps the value to the number of logical 1 bits.
|
||||
|
|
@ -222,8 +232,11 @@ pub unsafe fn _mm256_popcnt_epi64(a: __m256i) -> __m256i {
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpopcntq))]
|
||||
pub unsafe fn _mm256_maskz_popcnt_epi64(k: __mmask8, a: __m256i) -> __m256i {
|
||||
let zero = _mm256_setzero_si256().as_i64x4();
|
||||
transmute(simd_select_bitmask(k, simd_ctpop(a.as_i64x4()), zero))
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
simd_ctpop(a.as_i64x4()),
|
||||
i64x4::ZERO,
|
||||
))
|
||||
}
|
||||
|
||||
/// For each packed 64-bit integer maps the value to the number of logical 1 bits.
|
||||
|
|
@ -266,8 +279,11 @@ pub unsafe fn _mm_popcnt_epi64(a: __m128i) -> __m128i {
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vpopcntq))]
|
||||
pub unsafe fn _mm_maskz_popcnt_epi64(k: __mmask8, a: __m128i) -> __m128i {
|
||||
let zero = _mm_setzero_si128().as_i64x2();
|
||||
transmute(simd_select_bitmask(k, simd_ctpop(a.as_i64x2()), zero))
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
simd_ctpop(a.as_i64x2()),
|
||||
i64x2::ZERO,
|
||||
))
|
||||
}
|
||||
|
||||
/// For each packed 64-bit integer maps the value to the number of logical 1 bits.
|
||||
|
|
|
|||
|
|
@ -16,9 +16,6 @@ use crate::core_arch::x86::__m512i;
|
|||
use crate::core_arch::x86::__mmask16;
|
||||
use crate::core_arch::x86::__mmask32;
|
||||
use crate::core_arch::x86::__mmask64;
|
||||
use crate::core_arch::x86::_mm256_setzero_si256;
|
||||
use crate::core_arch::x86::_mm512_setzero_si512;
|
||||
use crate::core_arch::x86::_mm_setzero_si128;
|
||||
use crate::core_arch::x86::m128iExt;
|
||||
use crate::core_arch::x86::m256iExt;
|
||||
use crate::core_arch::x86::m512iExt;
|
||||
|
|
@ -110,7 +107,7 @@ pub unsafe fn _mm512_mask_gf2p8mul_epi8(
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
|
||||
pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
|
||||
let zero = _mm512_setzero_si512().as_i8x64();
|
||||
let zero = i8x64::ZERO;
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
|
||||
|
|
@ -169,7 +166,7 @@ pub unsafe fn _mm256_mask_gf2p8mul_epi8(
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
|
||||
pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
|
||||
let zero = _mm256_setzero_si256().as_i8x32();
|
||||
let zero = i8x32::ZERO;
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
|
||||
|
|
@ -228,7 +225,7 @@ pub unsafe fn _mm_mask_gf2p8mul_epi8(
|
|||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
|
||||
pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
|
||||
let zero = _mm_setzero_si128().as_i8x16();
|
||||
let zero = i8x16::ZERO;
|
||||
transmute(simd_select_bitmask(
|
||||
k,
|
||||
vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
|
||||
|
|
@ -277,7 +274,7 @@ pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
|
|||
) -> __m512i {
|
||||
static_assert_uimm_bits!(B, 8);
|
||||
let b = B as u8;
|
||||
let zero = _mm512_setzero_si512().as_i8x64();
|
||||
let zero = i8x64::ZERO;
|
||||
let x = x.as_i8x64();
|
||||
let a = a.as_i8x64();
|
||||
let r = vgf2p8affineqb_512(x, a, b);
|
||||
|
|
@ -353,7 +350,7 @@ pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
|
|||
) -> __m256i {
|
||||
static_assert_uimm_bits!(B, 8);
|
||||
let b = B as u8;
|
||||
let zero = _mm256_setzero_si256().as_i8x32();
|
||||
let zero = i8x32::ZERO;
|
||||
let x = x.as_i8x32();
|
||||
let a = a.as_i8x32();
|
||||
let r = vgf2p8affineqb_256(x, a, b);
|
||||
|
|
@ -429,7 +426,7 @@ pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
|
|||
) -> __m128i {
|
||||
static_assert_uimm_bits!(B, 8);
|
||||
let b = B as u8;
|
||||
let zero = _mm_setzero_si128().as_i8x16();
|
||||
let zero = i8x16::ZERO;
|
||||
let x = x.as_i8x16();
|
||||
let a = a.as_i8x16();
|
||||
let r = vgf2p8affineqb_128(x, a, b);
|
||||
|
|
@ -509,7 +506,7 @@ pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
|
|||
) -> __m512i {
|
||||
static_assert_uimm_bits!(B, 8);
|
||||
let b = B as u8;
|
||||
let zero = _mm512_setzero_si512().as_i8x64();
|
||||
let zero = i8x64::ZERO;
|
||||
let x = x.as_i8x64();
|
||||
let a = a.as_i8x64();
|
||||
let r = vgf2p8affineinvqb_512(x, a, b);
|
||||
|
|
@ -591,7 +588,7 @@ pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
|
|||
) -> __m256i {
|
||||
static_assert_uimm_bits!(B, 8);
|
||||
let b = B as u8;
|
||||
let zero = _mm256_setzero_si256().as_i8x32();
|
||||
let zero = i8x32::ZERO;
|
||||
let x = x.as_i8x32();
|
||||
let a = a.as_i8x32();
|
||||
let r = vgf2p8affineinvqb_256(x, a, b);
|
||||
|
|
@ -673,7 +670,7 @@ pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
|
|||
) -> __m128i {
|
||||
static_assert_uimm_bits!(B, 8);
|
||||
let b = B as u8;
|
||||
let zero = _mm_setzero_si128().as_i8x16();
|
||||
let zero = i8x16::ZERO;
|
||||
let x = x.as_i8x16();
|
||||
let a = a.as_i8x16();
|
||||
let r = vgf2p8affineinvqb_128(x, a, b);
|
||||
|
|
|
|||
|
|
@ -983,7 +983,7 @@ pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
|
|||
#[cfg_attr(test, assert_instr(xorps))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_setzero_ps() -> __m128 {
|
||||
__m128([0.0, 0.0, 0.0, 0.0])
|
||||
const { mem::zeroed() }
|
||||
}
|
||||
|
||||
/// A utility function for creating masks to use with Intel shuffle and
|
||||
|
|
@ -1089,7 +1089,7 @@ pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
|
|||
pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
|
||||
// Propagate the highest bit to the rest, because simd_bitmask
|
||||
// requires all-1 or all-0.
|
||||
let mask: i32x4 = simd_lt(transmute(a), i32x4::splat(0));
|
||||
let mask: i32x4 = simd_lt(transmute(a), i32x4::ZERO);
|
||||
simd_bitmask::<i32x4, u8>(mask).into()
|
||||
}
|
||||
|
||||
|
|
@ -1881,7 +1881,7 @@ pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
|
|||
#[target_feature(enable = "sse")]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_undefined_ps() -> __m128 {
|
||||
_mm_set1_ps(0.0)
|
||||
const { mem::zeroed() }
|
||||
}
|
||||
|
||||
/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
|
||||
|
|
|
|||
|
|
@ -455,9 +455,8 @@ unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
|
|||
16 - shift + i
|
||||
}
|
||||
}
|
||||
let zero = _mm_set1_epi8(0).as_i8x16();
|
||||
transmute::<i8x16, _>(simd_shuffle!(
|
||||
zero,
|
||||
i8x16::ZERO,
|
||||
a.as_i8x16(),
|
||||
[
|
||||
mask(IMM8, 0),
|
||||
|
|
@ -670,10 +669,9 @@ unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
|
|||
i + (shift as u32)
|
||||
}
|
||||
}
|
||||
let zero = _mm_set1_epi8(0).as_i8x16();
|
||||
let x: i8x16 = simd_shuffle!(
|
||||
a.as_i8x16(),
|
||||
zero,
|
||||
i8x16::ZERO,
|
||||
[
|
||||
mask(IMM8, 0),
|
||||
mask(IMM8, 1),
|
||||
|
|
@ -1191,7 +1189,7 @@ pub unsafe fn _mm_setr_epi8(
|
|||
#[cfg_attr(test, assert_instr(xorps))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_setzero_si128() -> __m128i {
|
||||
_mm_set1_epi64x(0)
|
||||
const { mem::zeroed() }
|
||||
}
|
||||
|
||||
/// Loads 64-bit integer from memory into first element of returned vector.
|
||||
|
|
@ -1359,8 +1357,7 @@ pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
|
|||
)]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i {
|
||||
let zero = _mm_setzero_si128();
|
||||
let r: i64x2 = simd_shuffle!(a.as_i64x2(), zero.as_i64x2(), [0, 2]);
|
||||
let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]);
|
||||
transmute(r)
|
||||
}
|
||||
|
||||
|
|
@ -1434,7 +1431,7 @@ pub unsafe fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
|
|||
#[cfg_attr(test, assert_instr(pmovmskb))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 {
|
||||
let z = i8x16::splat(0);
|
||||
let z = i8x16::ZERO;
|
||||
let m: i8x16 = simd_lt(a.as_i8x16(), z);
|
||||
simd_bitmask::<_, u16>(m) as u32 as i32
|
||||
}
|
||||
|
|
@ -2267,7 +2264,7 @@ pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
|
|||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
|
||||
let r = simd_cast::<_, f32x2>(a.as_f64x2());
|
||||
let zero = f32x2::new(0.0, 0.0);
|
||||
let zero = f32x2::ZERO;
|
||||
transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
|
||||
}
|
||||
|
||||
|
|
@ -2447,7 +2444,7 @@ pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
|
|||
#[cfg_attr(test, assert_instr(xorp))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_setzero_pd() -> __m128d {
|
||||
_mm_set_pd(0.0, 0.0)
|
||||
const { mem::zeroed() }
|
||||
}
|
||||
|
||||
/// Returns a mask of the most significant bit of each element in `a`.
|
||||
|
|
@ -2463,7 +2460,7 @@ pub unsafe fn _mm_setzero_pd() -> __m128d {
|
|||
pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 {
|
||||
// Propagate the highest bit to the rest, because simd_bitmask
|
||||
// requires all-1 or all-0.
|
||||
let mask: i64x2 = simd_lt(transmute(a), i64x2::splat(0));
|
||||
let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
|
||||
simd_bitmask::<i64x2, u8>(mask).into()
|
||||
}
|
||||
|
||||
|
|
@ -2902,7 +2899,7 @@ pub unsafe fn _mm_castsi128_ps(a: __m128i) -> __m128 {
|
|||
#[target_feature(enable = "sse2")]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_undefined_pd() -> __m128d {
|
||||
__m128d([0.0, 0.0])
|
||||
const { mem::zeroed() }
|
||||
}
|
||||
|
||||
/// Returns vector of type __m128i with indeterminate elements.
|
||||
|
|
@ -2914,7 +2911,7 @@ pub unsafe fn _mm_undefined_pd() -> __m128d {
|
|||
#[target_feature(enable = "sse2")]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_undefined_si128() -> __m128i {
|
||||
__m128i([0, 0])
|
||||
const { mem::zeroed() }
|
||||
}
|
||||
|
||||
/// The resulting `__m128d` element is composed by the low-order values of
|
||||
|
|
|
|||
|
|
@ -60,7 +60,7 @@ pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTI
|
|||
#[cfg_attr(test, assert_instr(pblendvb))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
|
||||
let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::splat(0));
|
||||
let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::ZERO);
|
||||
transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
|
||||
}
|
||||
|
||||
|
|
@ -103,7 +103,7 @@ pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128
|
|||
#[cfg_attr(test, assert_instr(blendvpd))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
|
||||
let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::splat(0));
|
||||
let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::ZERO);
|
||||
transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
|
||||
}
|
||||
|
||||
|
|
@ -116,7 +116,7 @@ pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
|
|||
#[cfg_attr(test, assert_instr(blendvps))]
|
||||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
|
||||
let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::splat(0));
|
||||
let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::ZERO);
|
||||
transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ use stdarch_test::assert_instr;
|
|||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i {
|
||||
let a = a.as_i8x16();
|
||||
let zero = i8x16::splat(0);
|
||||
let zero = i8x16::ZERO;
|
||||
let r = simd_select::<m8x16, _>(simd_lt(a, zero), simd_neg(a), a);
|
||||
transmute(r)
|
||||
}
|
||||
|
|
@ -34,7 +34,7 @@ pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i {
|
|||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i {
|
||||
let a = a.as_i16x8();
|
||||
let zero = i16x8::splat(0);
|
||||
let zero = i16x8::ZERO;
|
||||
let r = simd_select::<m16x8, _>(simd_lt(a, zero), simd_neg(a), a);
|
||||
transmute(r)
|
||||
}
|
||||
|
|
@ -50,7 +50,7 @@ pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i {
|
|||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||||
pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i {
|
||||
let a = a.as_i32x4();
|
||||
let zero = i32x4::splat(0);
|
||||
let zero = i32x4::ZERO;
|
||||
let r = simd_select::<m32x4, _>(simd_lt(a, zero), simd_neg(a), a);
|
||||
transmute(r)
|
||||
}
|
||||
|
|
@ -103,12 +103,12 @@ pub unsafe fn _mm_alignr_epi8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128
|
|||
// If palignr is shifting the pair of vectors more than the size of two
|
||||
// lanes, emit zero.
|
||||
if IMM8 > 32 {
|
||||
return _mm_set1_epi8(0);
|
||||
return _mm_setzero_si128();
|
||||
}
|
||||
// If palignr is shifting the pair of input vectors more than one lane,
|
||||
// but less than two lanes, convert to shifting in zeroes.
|
||||
let (a, b) = if IMM8 > 16 {
|
||||
(_mm_set1_epi8(0), a)
|
||||
(_mm_setzero_si128(), a)
|
||||
} else {
|
||||
(a, b)
|
||||
};
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue