Another batch of AVX2 intrinsics (#4)

add more avx2 intrinsics
This commit is contained in:
Jack Mott 2017-05-29 09:36:08 -05:00 committed by Andrew Gallant
parent 0b760d7e20
commit 59aa3f75ac
2 changed files with 758 additions and 32 deletions

View file

@ -811,8 +811,8 @@ avx2
* [ ] `_mm_i64gather_epi64`
* [ ] `_mm256_i64gather_epi64`
* [ ] `_mm256_inserti128_si256`
* [ ] `_mm256_madd_epi16`
* [ ] `_mm256_maddubs_epi16`
* [x] `_mm256_madd_epi16`
* [x] `_mm256_maddubs_epi16`
* [ ] `_mm_mask_i32gather_pd`
* [ ] `_mm256_mask_i32gather_pd`
* [ ] `_mm_mask_i32gather_ps`
@ -837,45 +837,45 @@ avx2
* [ ] `_mm256_maskstore_epi32`
* [ ] `_mm_maskstore_epi64`
* [ ] `_mm256_maskstore_epi64`
* [ ] `_mm256_max_epi8`
* [ ] `_mm256_max_epi16`
* [ ] `_mm256_max_epi32`
* [ ] `_mm256_max_epu8`
* [ ] `_mm256_max_epu16`
* [ ] `_mm256_max_epu32`
* [ ] `_mm256_min_epi8`
* [ ] `_mm256_min_epi16`
* [ ] `_mm256_min_epi32`
* [ ] `_mm256_min_epu8`
* [ ] `_mm256_min_epu16`
* [ ] `_mm256_min_epu32`
* [x] `_mm256_max_epi8`
* [x] `_mm256_max_epi16`
* [x] `_mm256_max_epi32`
* [x] `_mm256_max_epu8`
* [x] `_mm256_max_epu16`
* [x] `_mm256_max_epu32`
* [x] `_mm256_min_epi8`
* [x] `_mm256_min_epi16`
* [x] `_mm256_min_epi32`
* [x] `_mm256_min_epu8`
* [x] `_mm256_min_epu16`
* [x] `_mm256_min_epu32`
* [ ] `_mm256_movemask_epi8`
* [ ] `_mm256_mpsadbw_epu8`
* [ ] `_mm256_mul_epi32`
* [ ] `_mm256_mul_epu32`
* [ ] `_mm256_mulhi_epi16`
* [ ] `_mm256_mulhi_epu16`
* [ ] `_mm256_mulhrs_epi16`
* [ ] `_mm256_mullo_epi16`
* [ ] `_mm256_mullo_epi32`
* [ ] `_mm256_or_si256`
* [ ] `_mm256_packs_epi16`
* [ ] `_mm256_packs_epi32`
* [ ] `_mm256_packus_epi16`
* [ ] `_mm256_packus_epi32`
* [x] `_mm256_mul_epi32`
* [x] `_mm256_mul_epu32`
* [x] `_mm256_mulhi_epi16`
* [x] `_mm256_mulhi_epu16`
* [x] `_mm256_mulhrs_epi16`
* [x] `_mm256_mullo_epi16`
* [x] `_mm256_mullo_epi32`
* [x] `_mm256_or_si256`
* [x] `_mm256_packs_epi16`
* [x] `_mm256_packs_epi32`
* [x] `_mm256_packus_epi16`
* [x] `_mm256_packus_epi32`
* [ ] `_mm256_permute2x128_si256`
* [ ] `_mm256_permute4x64_epi64`
* [ ] `_mm256_permute4x64_pd`
* [ ] `_mm256_permutevar8x32_epi32`
* [ ] `_mm256_permutevar8x32_ps`
* [ ] `_mm256_sad_epu8`
* [x] `_mm256_sad_epu8`
* [ ] `_mm256_shuffle_epi32`
* [ ] `_mm256_shuffle_epi8`
* [ ] `_mm256_shufflehi_epi16`
* [ ] `_mm256_shufflelo_epi16`
* [ ] `_mm256_sign_epi8`
* [ ] `_mm256_sign_epi16`
* [ ] `_mm256_sign_epi32`
* [x] `_mm256_sign_epi8`
* [x] `_mm256_sign_epi16`
* [x] `_mm256_sign_epi32`
* [ ] `_mm256_slli_si256`
* [ ] `_mm256_bslli_epi128`
* [ ] `_mm256_sll_epi16`

View file

@ -78,6 +78,8 @@ pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 {
unsafe { paddusw(a, b) }
}
// TODO _mm256_alignr_epi8
/// Compute the bitwise AND of 256 bits (representing integer data)
/// in `a` and `b`.
#[inline(always)]
@ -108,7 +110,9 @@ pub fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 {
unsafe { pavgb(a, b) }
}
// TODO _mm256_alignr_epi8
// TODO _mm256_blend_epi16
// TODO _mm_blend_epi32
// TODO _mm256_blend_epi32
@ -252,6 +256,338 @@ pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { phsubsw(a, b) }
}
// TODO _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale)
// TODO _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
// TODO _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)
// TODO _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale)
// TODO _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
// TODO _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale)
// TODO _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i32gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale)
// TODO _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale)
// TODO _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale)
// TODO _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale)
// TODO _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale)
// TODO _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale)
// TODO _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale)
// TODO _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
// TODO _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale)
// TODO _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale)
// TODO _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale)
// TODO _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale)
// TODO _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale)
// TODO _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale)
// TODO _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale)
// TODO _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale)
// TODO _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale)
// TODO _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale)
// TODO _mm256_mask_i64gather_ps
// TODO _mm256_inserti128_si256
/// Multiply packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally add adjacent pairs
/// of intermediate 32-bit integers.
///
/// (Wraps the `llvm.x86.avx2.pmadd.wd` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 {
unsafe { pmaddwd(a, b) }
}
/// Vertically multiply each unsigned 8-bit integer from `a` with the
/// corresponding signed 8-bit integer from `b`, producing intermediate
/// signed 16-bit integers. Horizontally add adjacent pairs of intermediate
/// signed 16-bit integers
///
/// NOTE(review): Intel defines `b` as *signed* 8-bit integers, but this
/// signature types both operands as `u8x32` — confirm the intended type
/// for `b`. (Wraps the `llvm.x86.avx2.pmadd.ub.sw` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 {
unsafe { pmaddubsw(a, b) }
}
// TODO _mm_maskload_epi32 (int const* mem_addr, __m128i mask)
// TODO _mm256_maskload_epi32 (int const* mem_addr, __m256i mask)
// TODO _mm_maskload_epi64 (__int64 const* mem_addr, __m128i mask)
// TODO _mm256_maskload_epi64 (__int64 const* mem_addr, __m256i mask)
// TODO _mm_maskstore_epi32 (int* mem_addr, __m128i mask, __m128i a)
// TODO _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a)
// TODO _mm_maskstore_epi64 (__int64* mem_addr, __m128i mask, __m128i a)
// TODO _mm256_maskstore_epi64 (__int64* mem_addr, __m256i mask, __m256i a)
/// Compare packed 16-bit integers in `a` and `b`, and return the packed
/// maximum values.
///
/// (Wraps the `llvm.x86.avx2.pmaxs.w` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { pmaxsw(a, b) }
}
/// Compare packed 32-bit integers in `a` and `b`, and return the packed
/// maximum values.
///
/// (Wraps the `llvm.x86.avx2.pmaxs.d` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 {
unsafe { pmaxsd(a, b) }
}
/// Compare packed 8-bit integers in `a` and `b`, and return the packed
/// maximum values.
///
/// (Wraps the `llvm.x86.avx2.pmaxs.b` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 {
unsafe { pmaxsb(a, b) }
}
/// Compare packed unsigned 16-bit integers in `a` and `b`, and return
/// the packed maximum values.
///
/// (Wraps the `llvm.x86.avx2.pmaxu.w` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 {
unsafe { pmaxuw(a, b) }
}
/// Compare packed unsigned 32-bit integers in `a` and `b`, and return
/// the packed maximum values.
///
/// (Wraps the `llvm.x86.avx2.pmaxu.d` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 {
unsafe { pmaxud(a, b) }
}
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return
/// the packed maximum values.
///
/// (Wraps the `llvm.x86.avx2.pmaxu.b` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 {
unsafe { pmaxub(a, b) }
}
/// Compare packed 16-bit integers in `a` and `b`, and return the packed
/// minimum values.
///
/// (Wraps the `llvm.x86.avx2.pmins.w` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { pminsw(a, b) }
}
/// Compare packed 32-bit integers in `a` and `b`, and return the packed
/// minimum values.
///
/// (Wraps the `llvm.x86.avx2.pmins.d` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 {
unsafe { pminsd(a, b) }
}
/// Compare packed 8-bit integers in `a` and `b`, and return the packed
/// minimum values.
///
/// (Wraps the `llvm.x86.avx2.pmins.b` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 {
unsafe { pminsb(a, b) }
}
/// Compare packed unsigned 16-bit integers in `a` and `b`, and return
/// the packed minimum values.
///
/// (Wraps the `llvm.x86.avx2.pminu.w` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 {
unsafe { pminuw(a, b) }
}
/// Compare packed unsigned 32-bit integers in `a` and `b`, and return
/// the packed minimum values.
///
/// (Wraps the `llvm.x86.avx2.pminu.d` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 {
unsafe { pminud(a, b) }
}
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return
/// the packed minimum values.
///
/// (Wraps the `llvm.x86.avx2.pminu.b` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_min_epu8(a: u8x32, b: u8x32) -> u8x32 {
unsafe { pminub(a, b) }
}
/*** The following two functions fail in debug, but work in release
/// Create mask from the most significant bit of each 8-bit element in `a`,
/// return the result.
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_movemask_epi8(a: i8x32) -> i32 {
unsafe { pmovmskb(a) }
}
/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
/// 8-bit integers in `a` compared to those in `b`, and store the 16-bit
/// results in dst. Eight SADs are performed for each 128-bit lane using one
/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
/// selected from `b` starting at on the offset specified in `imm8`. Eight
/// quadruplets are formed from sequential 8-bit integers selected from `a`
/// starting at the offset specified in `imm8`.
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_mpsadbw_epu8(a: u8x32, b: u8x32, imm8: i32) -> u16x16 {
unsafe { mpsadbw(a, b, imm8) }
}
***/
/// Multiply the low 32-bit integers from each packed 64-bit element in
/// `a` and `b`
///
/// Return the 64-bit results.
///
/// (Wraps the `llvm.x86.avx2.pmul.dq` LLVM intrinsic; only the
/// even-indexed 32-bit lanes of `a` and `b` contribute to the result.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 {
unsafe { pmuldq(a, b) }
}
/// Multiply the low unsigned 32-bit integers from each packed 64-bit
/// element in `a` and `b`
///
/// Return the unsigned 64-bit results.
///
/// (Wraps the `llvm.x86.avx2.pmulu.dq` LLVM intrinsic; only the
/// even-indexed 32-bit lanes of `a` and `b` contribute to the result.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 {
unsafe { pmuludq(a, b) }
}
/// Multiply the packed 16-bit integers in `a` and `b`, producing
/// intermediate 32-bit integers and returning the high 16 bits of the
/// intermediate integers.
///
/// (Wraps the `llvm.x86.avx2.pmulh.w` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { pmulhw(a, b) }
}
/// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing
/// intermediate 32-bit integers and returning the high 16 bits of the
/// intermediate integers.
///
/// (Wraps the `llvm.x86.avx2.pmulhu.w` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
unsafe { pmulhuw(a, b) }
}
/// Multiply the packed 16-bit integers in `a` and `b`, producing
/// intermediate 32-bit integers, and return the low 16 bits of the
/// intermediate integers
///
/// (Implemented with the lane-wise `*` operator.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
a * b
}
/// Multiply the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and return the low 32 bits of the
/// intermediate integers
///
/// (Doc fix: a 32x32 multiply keeps the low *32* bits, not 16.
/// Implemented with the lane-wise `*` operator.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
a * b
}
/// Multiply packed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Truncate each intermediate
/// integer to the 18 most significant bits, round by adding 1, and
/// return bits [16:1]
///
/// (Wraps the `llvm.x86.avx2.pmul.hr.sw` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 {
unsafe { pmulhrsw(a, b) }
}
/// Compute the bitwise OR of 256 bits (representing integer data) in `a`
/// and `b`
///
/// (Implemented with the lane-wise `|` operator.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
a | b
}
/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation
///
/// Note: packing interleaves per 128-bit lane (a.lo, b.lo, a.hi, b.hi),
/// not across the full 256-bit register.
/// (Wraps the `llvm.x86.avx2.packsswb` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 {
unsafe { packsswb(a, b) }
}
/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using signed saturation
///
/// Note: packing interleaves per 128-bit lane (a.lo, b.lo, a.hi, b.hi),
/// not across the full 256-bit register.
/// (Wraps the `llvm.x86.avx2.packssdw` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 {
unsafe { packssdw(a, b) }
}
/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using unsigned saturation
///
/// Note: packing interleaves per 128-bit lane (a.lo, b.lo, a.hi, b.hi),
/// not across the full 256-bit register.
/// (Wraps the `llvm.x86.avx2.packuswb` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 {
unsafe { packuswb(a, b) }
}
/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// Note: packing interleaves per 128-bit lane (a.lo, b.lo, a.hi, b.hi),
/// not across the full 256-bit register.
/// (Wraps the `llvm.x86.avx2.packusdw` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
unsafe { packusdw(a, b) }
}
// TODO _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)
// TODO _mm256_permute4x64_epi64 (__m256i a, const int imm8)
// TODO _mm256_permute4x64_pd (__m256d a, const int imm8)
// TODO _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx)
// TODO _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
/// Compute the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sum each consecutive 8 differences to
/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit
/// integers in the low 16 bits of the 64-bit return value
///
/// (Wraps the `llvm.x86.avx2.psad.bw` LLVM intrinsic.)
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 {
unsafe { psadbw(a, b) }
}
// TODO _mm256_shuffle_epi32 (__m256i a, const int imm8)
// TODO _mm256_shuffle_epi8 (__m256i a, __m256i b)
// TODO _mm256_shufflehi_epi16 (__m256i a, const int imm8)
// TODO _mm256_shufflelo_epi16 (__m256i a, const int imm8)
/// Negate packed 16-bit integers in `a` when the corresponding signed
/// 16-bit integer in `b` is negative; pass them through unchanged when
/// `b` is positive (zeroed when `b` is zero, per `psignw` semantics).
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 {
unsafe { psignw(a, b) }
}
/// Negate packed 32-bit integers in `a` when the corresponding signed
/// 32-bit integer in `b` is negative; pass them through unchanged when
/// `b` is positive (zeroed when `b` is zero, per `psignd` semantics).
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 {
unsafe { psignd(a, b) }
}
/// Negate packed 8-bit integers in `a` when the corresponding signed
/// 8-bit integer in `b` is negative; pass them through unchanged when
/// `b` is positive (zeroed when `b` is zero, per `psignb` semantics).
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 {
unsafe { psignb(a, b) }
}
#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.avx2.pabs.b"]
@ -267,7 +603,7 @@ extern "C" {
#[link_name = "llvm.x86.avx2.paddus.b"]
fn paddusb(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.paddus.w"]
fn paddusw(a: u16x16, b: u16x16) -> u16x16;
fn paddusw(a: u16x16, b: u16x16) -> u16x16;
#[link_name = "llvm.x86.avx2.pavg.b"]
fn pavgb(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.pavg.w"]
@ -286,6 +622,65 @@ extern "C" {
fn phsubd(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx2.phsub.sw"]
fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.pmadd.wd"]
fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
#[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
#[link_name = "llvm.x86.avx2.pmaxs.w"]
fn pmaxsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.pmaxs.d"]
fn pmaxsd(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx2.pmaxs.b"]
fn pmaxsb(a: i8x32, b: i8x32) -> i8x32;
#[link_name = "llvm.x86.avx2.pmaxu.w"]
fn pmaxuw(a: u16x16, b: u16x16) -> u16x16;
#[link_name = "llvm.x86.avx2.pmaxu.d"]
fn pmaxud(a: u32x8, b: u32x8) -> u32x8;
#[link_name = "llvm.x86.avx2.pmaxu.b"]
fn pmaxub(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.pmins.w"]
fn pminsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.pmins.d"]
fn pminsd(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx2.pmins.b"]
fn pminsb(a: i8x32, b: i8x32) -> i8x32;
#[link_name = "llvm.x86.avx2.pminu.w"]
fn pminuw(a: u16x16, b: u16x16) -> u16x16;
#[link_name = "llvm.x86.avx2.pminu.d"]
fn pminud(a: u32x8, b: u32x8) -> u32x8;
#[link_name = "llvm.x86.avx2.pminu.b"]
fn pminub(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.pmovmskb"] //fails in debug
fn pmovmskb(a: i8x32) -> i32;
#[link_name = "llvm.x86.avx2.mpsadbw"] //fails in debug
fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
#[link_name = "llvm.x86.avx2.pmulhu.w"]
fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
#[link_name = "llvm.x86.avx2.pmulh.w"]
fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.pmul.dq"]
fn pmuldq(a: i32x8, b:i32x8) -> i64x4;
#[link_name = "llvm.x86.avx2.pmulu.dq"]
fn pmuludq(a: u32x8, b:u32x8) -> u64x4;
#[link_name = "llvm.x86.avx2.pmul.hr.sw"]
fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.packsswb"]
fn packsswb(a: i16x16, b: i16x16) -> i8x32;
#[link_name = "llvm.x86.avx2.packssdw"]
fn packssdw(a: i32x8, b: i32x8) -> i16x16;
#[link_name = "llvm.x86.avx2.packuswb"]
fn packuswb(a: i16x16, b: i16x16) -> u8x32;
#[link_name = "llvm.x86.avx2.packusdw"]
fn packusdw(a: i32x8, b: i32x8) -> u16x16;
#[link_name = "llvm.x86.avx2.psad.bw"]
fn psadbw(a: u8x32, b: u8x32) -> u64x4;
#[link_name = "llvm.x86.avx2.psign.b"]
fn psignb(a: i8x32, b: i8x32) -> i8x32;
#[link_name = "llvm.x86.avx2.psign.w"]
fn psignw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.psign.d"]
fn psignd(a: i32x8, b: i32x8) -> i32x8;
}
@ -695,4 +1090,335 @@ mod tests {
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_madd_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_madd_epi16(a, b);
let e = i32x8::splat(16);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_maddubs_epi16() {
let a = u8x32::splat(2);
let b = u8x32::splat(4);
let r = avx2::_mm256_maddubs_epi16(a, b);
let e = i16x16::splat(16);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_max_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_max_epi16(a, b);
assert_eq!(r, b);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_max_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(4);
let r = avx2::_mm256_max_epi32(a, b);
assert_eq!(r, b);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_max_epi8() {
let a = i8x32::splat(2);
let b = i8x32::splat(4);
let r = avx2::_mm256_max_epi8(a, b);
assert_eq!(r, b);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_max_epu16() {
let a = u16x16::splat(2);
let b = u16x16::splat(4);
let r = avx2::_mm256_max_epu16(a, b);
assert_eq!(r, b);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_max_epu32() {
let a = u32x8::splat(2);
let b = u32x8::splat(4);
let r = avx2::_mm256_max_epu32(a, b);
assert_eq!(r, b);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_max_epu8() {
let a = u8x32::splat(2);
let b = u8x32::splat(4);
let r = avx2::_mm256_max_epu8(a, b);
assert_eq!(r, b);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_min_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_min_epi16(a, b);
assert_eq!(r, a);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_min_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(4);
let r = avx2::_mm256_min_epi32(a, b);
assert_eq!(r, a);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_min_epi8() {
let a = i8x32::splat(2);
let b = i8x32::splat(4);
let r = avx2::_mm256_min_epi8(a, b);
assert_eq!(r, a);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_min_epu16() {
let a = u16x16::splat(2);
let b = u16x16::splat(4);
let r = avx2::_mm256_min_epu16(a, b);
assert_eq!(r, a);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_min_epu32() {
let a = u32x8::splat(2);
let b = u32x8::splat(4);
let r = avx2::_mm256_min_epu32(a, b);
assert_eq!(r, a);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_min_epu8() {
let a = u8x32::splat(2);
let b = u8x32::splat(4);
let r = avx2::_mm256_min_epu8(a, b);
assert_eq!(r, a);
}
/**
// TODO this fails in debug but not release, why?
#[test]
#[target_feature ="+avx2"]
fn _mm256_movemask_epi8() {
let a = i8x32::splat(-1);
let r = avx2::_mm256_movemask_epi8(a);
let e : i32 = -1;
assert_eq!(r, e);
}
// TODO This fails in debug but not in release, why?
#[test]
#[target_feature = "+avx2"]
fn _mm256_mpsadbw_epu8() {
let a = u8x32::splat(2);
let b = u8x32::splat(4);
let r = avx2::_mm256_mpsadbw_epu8(a, b, 0);
let e = u16x16::splat(8);
assert_eq!(r, e);
}
**/
#[test]
#[target_feature = "+avx2"]
fn _mm256_mul_epi32() {
let a = i32x8::new(0, 0, 0, 0, 2, 2, 2, 2);
let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let r = avx2::_mm256_mul_epi32(a, b);
let e = i64x4::new(0, 0, 10, 14);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_mul_epu32() {
let a = u32x8::new(0, 0, 0, 0, 2, 2, 2, 2);
let b = u32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let r = avx2::_mm256_mul_epu32(a, b);
let e = u64x4::new(0, 0, 10, 14);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_mulhi_epi16() {
let a = i16x16::splat(6535);
let b = i16x16::splat(6535);
let r = avx2::_mm256_mulhi_epi16(a, b);
let e = i16x16::splat(651);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_mulhi_epu16() {
let a = u16x16::splat(6535);
let b = u16x16::splat(6535);
let r = avx2::_mm256_mulhi_epu16(a, b);
let e = u16x16::splat(651);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_mullo_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_mullo_epi16(a, b);
let e = i16x16::splat(8);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_mullo_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(4);
let r = avx2::_mm256_mullo_epi32(a, b);
let e = i32x8::splat(8);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_mulhrs_epi16() {
// Bug fix: this test previously called `_mm256_mullo_epi16` (copy-paste
// from the test above), so `_mm256_mulhrs_epi16` was never exercised.
// pmulhrsw: ((a * b) >> 14 + 1) >> 1 per lane.
// 16384 * 16384 = 2^28; >> 14 = 16384; + 1 = 16385; >> 1 = 8192.
let a = i16x16::splat(16384);
let b = i16x16::splat(16384);
let r = avx2::_mm256_mulhrs_epi16(a, b);
let e = i16x16::splat(8192);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_or_si256() {
let a = __m256i::splat(-1);
let b = __m256i::splat(0);
let r = avx2::_mm256_or_si256(a, b);
assert_eq!(r, a);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_packs_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_packs_epi16(a, b);
let e = i8x32::new(
2, 2, 2, 2, 2, 2, 2, 2,
4, 4, 4, 4, 4, 4, 4, 4,
2, 2, 2, 2, 2, 2, 2, 2,
4, 4, 4, 4, 4, 4, 4, 4);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_packs_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(4);
let r = avx2::_mm256_packs_epi32(a, b);
let e = i16x16::new(
2, 2, 2, 2,
4, 4, 4, 4,
2, 2, 2, 2,
4, 4, 4, 4);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_packus_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(4);
let r = avx2::_mm256_packus_epi16(a, b);
let e = u8x32::new(
2, 2, 2, 2, 2, 2, 2, 2,
4, 4, 4, 4, 4, 4, 4, 4,
2, 2, 2, 2, 2, 2, 2, 2,
4, 4, 4, 4, 4, 4, 4, 4);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_packus_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(4);
let r = avx2::_mm256_packus_epi32(a, b);
let e = u16x16::new(
2, 2, 2, 2,
4, 4, 4, 4,
2, 2, 2, 2,
4, 4, 4, 4);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_sad_epu8() {
let a = u8x32::splat(2);
let b = u8x32::splat(4);
let r = avx2::_mm256_sad_epu8(a, b);
let e = u64x4::splat(16);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_sign_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(-1);
let r = avx2::_mm256_sign_epi16(a, b);
let e = i16x16::splat(-2);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_sign_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(-1);
let r = avx2::_mm256_sign_epi32(a, b);
let e = i32x8::splat(-2);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_sign_epi8() {
let a = i8x32::splat(2);
let b = i8x32::splat(-1);
let r = avx2::_mm256_sign_epi8(a, b);
let e = i8x32::splat(-2);
assert_eq!(r, e);
}
}