Convert `_mm256_srli_si256`/`_mm256_slli_si256` and `_mm256_bsrli_epi128`/`_mm256_bslli_epi128` to const generics (#1067)

This commit is contained in:
minybot 2021-03-09 22:44:27 -05:00 committed by GitHub
parent f0a9100c7f
commit 7583a92072
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 381 additions and 445 deletions

View file

@ -2565,17 +2565,12 @@ pub unsafe fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i {
let a = a.as_i64x4();
macro_rules! call {
($imm8:expr) => {
vpslldq(a, $imm8)
};
}
transmute(constify_imm8!(imm8 * 8, call))
// Thin alias: `_mm256_slli_si256` and `_mm256_bslli_epi128` are the same
// operation (shift each 128-bit lane left by IMM8 bytes, filling with zeros),
// so this simply forwards to the epi128-named implementation.
pub unsafe fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
// IMM8 must be an 8-bit immediate (0..=255); enforced at compile time.
static_assert_imm8!(IMM8);
_mm256_bslli_epi128::<IMM8>(a)
}
/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
@ -2583,17 +2578,52 @@ pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bslli_epi128)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i {
let a = a.as_i64x4();
macro_rules! call {
($imm8:expr) => {
vpslldq(a, $imm8)
};
}
transmute(constify_imm8!(imm8 * 8, call))
pub unsafe fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    // Shuffle index for result byte `i`. The shuffle concatenates `zero`
    // (indices 0..=31) with `a` (indices 32..=63). Within each 16-byte lane,
    // result byte `i` is `a[i - shift]` once `i % 16 >= shift`, and zero
    // below that. Computing the index through this const fn:
    //   * keeps lane 2 reading from lane 2 of `a` (the previous hand-written
    //     table subtracted an extra 16 there, pulling bytes from lane 1);
    //   * zeroes the whole lane for shifts >= 16, per Intel's semantics;
    //   * avoids the u32 underflow that a plain `32 - IMM8` hits for large
    //     immediates (IMM8 >= 33), which would be an invalid shuffle index.
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 || i % 16 < shift {
            0 // any element of `zero`
        } else {
            32 + (i - shift)
        }
    }
    let a = a.as_i8x32();
    let zero = _mm256_setzero_si256().as_i8x32();
    let r: i8x32 = simd_shuffle32(
        zero,
        a,
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
            mask(IMM8, 16),
            mask(IMM8, 17),
            mask(IMM8, 18),
            mask(IMM8, 19),
            mask(IMM8, 20),
            mask(IMM8, 21),
            mask(IMM8, 22),
            mask(IMM8, 23),
            mask(IMM8, 24),
            mask(IMM8, 25),
            mask(IMM8, 26),
            mask(IMM8, 27),
            mask(IMM8, 28),
            mask(IMM8, 29),
            mask(IMM8, 30),
            mask(IMM8, 31),
        ],
    );
    transmute(r)
}
/// Shifts packed 32-bit integers in `a` left by the amount
@ -2729,17 +2759,12 @@ pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i {
let a = a.as_i64x4();
macro_rules! call {
($imm8:expr) => {
vpsrldq(a, $imm8)
};
}
transmute(constify_imm8!(imm8 * 8, call))
// Thin alias: `_mm256_srli_si256` and `_mm256_bsrli_epi128` are the same
// operation (shift each 128-bit lane right by IMM8 bytes, filling with
// zeros), so this simply forwards to the epi128-named implementation.
pub unsafe fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
// IMM8 must be an 8-bit immediate (0..=255); enforced at compile time.
static_assert_imm8!(IMM8);
_mm256_bsrli_epi128::<IMM8>(a)
}
/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
@ -2747,17 +2772,145 @@ pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bsrli_epi128)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i {
let a = a.as_i64x4();
macro_rules! call {
($imm8:expr) => {
vpsrldq(a, $imm8)
};
}
transmute(constify_imm8!(imm8 * 8, call))
pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    // Shuffle index for result byte `i`. The shuffle concatenates `zero`
    // (indices 0..=31) with `a` (indices 32..=63). Within each 16-byte lane,
    // result byte `i` is `a[i + shift]` while the source stays inside the
    // same lane, and zero once it would run off the lane's end. This replaces
    // the hand-written `match IMM8 % 16` table, which had two defects:
    //   * shifts >= 16 wrapped around (e.g. IMM8 == 16 returned `a`
    //     unchanged) instead of zeroing the lanes per Intel's semantics —
    //     its `_ => zero` arm was unreachable since `% 16` is always 0..=15;
    //   * the arm for IMM8 == 15 started its first lane at byte 14 instead
    //     of byte 15.
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 || (15 - (i % 16)) < shift {
            0 // any element of `zero`
        } else {
            32 + (i + shift)
        }
    }
    let a = a.as_i8x32();
    let zero = _mm256_setzero_si256().as_i8x32();
    let r: i8x32 = simd_shuffle32(
        zero,
        a,
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
            mask(IMM8, 16),
            mask(IMM8, 17),
            mask(IMM8, 18),
            mask(IMM8, 19),
            mask(IMM8, 20),
            mask(IMM8, 21),
            mask(IMM8, 22),
            mask(IMM8, 23),
            mask(IMM8, 24),
            mask(IMM8, 25),
            mask(IMM8, 26),
            mask(IMM8, 27),
            mask(IMM8, 28),
            mask(IMM8, 29),
            mask(IMM8, 30),
            mask(IMM8, 31),
        ],
    );
    transmute(r)
}
/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
@ -4824,7 +4977,7 @@ mod tests {
#[simd_test(enable = "avx2")]
unsafe fn test_mm256_slli_si256() {
let a = _mm256_set1_epi64x(0xFFFFFFFF);
let r = _mm256_slli_si256(a, 3);
let r = _mm256_slli_si256::<3>(a);
assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
}
@ -4923,7 +5076,7 @@ mod tests {
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32,
);
let r = _mm256_srli_si256(a, 3);
let r = _mm256_srli_si256::<3>(a);
#[rustfmt::skip]
let e = _mm256_setr_epi8(
4, 5, 6, 7, 8, 9, 10, 11,

View file

@ -8959,76 +8959,166 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
static_assert_imm8!(IMM8);
let a = a.as_i8x64();
let zero = _mm512_setzero_si512().as_i8x64();
let r: i8x64 = simd_shuffle64(
a,
zero,
[
0 + (IMM8 as u32 & 0xff) + 48,
1 + (IMM8 as u32 & 0xff) + 48,
2 + (IMM8 as u32 & 0xff) + 48,
3 + (IMM8 as u32 & 0xff) + 48,
4 + (IMM8 as u32 & 0xff) + 48,
5 + (IMM8 as u32 & 0xff) + 48,
6 + (IMM8 as u32 & 0xff) + 48,
7 + (IMM8 as u32 & 0xff) + 48,
8 + (IMM8 as u32 & 0xff) + 48,
9 + (IMM8 as u32 & 0xff) + 48,
10 + (IMM8 as u32 & 0xff) + 48,
11 + (IMM8 as u32 & 0xff) + 48,
12 + (IMM8 as u32 & 0xff) + 48,
13 + (IMM8 as u32 & 0xff) + 48,
14 + (IMM8 as u32 & 0xff) + 48,
15 + (IMM8 as u32 & 0xff) + 48,
16 + (IMM8 as u32 & 0xff) + 32,
17 + (IMM8 as u32 & 0xff) + 32,
18 + (IMM8 as u32 & 0xff) + 32,
19 + (IMM8 as u32 & 0xff) + 32,
20 + (IMM8 as u32 & 0xff) + 32,
21 + (IMM8 as u32 & 0xff) + 32,
22 + (IMM8 as u32 & 0xff) + 32,
23 + (IMM8 as u32 & 0xff) + 32,
24 + (IMM8 as u32 & 0xff) + 32,
25 + (IMM8 as u32 & 0xff) + 32,
26 + (IMM8 as u32 & 0xff) + 32,
27 + (IMM8 as u32 & 0xff) + 32,
28 + (IMM8 as u32 & 0xff) + 32,
29 + (IMM8 as u32 & 0xff) + 32,
30 + (IMM8 as u32 & 0xff) + 32,
31 + (IMM8 as u32 & 0xff) + 32,
32 + (IMM8 as u32 & 0xff) + 16,
33 + (IMM8 as u32 & 0xff) + 16,
34 + (IMM8 as u32 & 0xff) + 16,
35 + (IMM8 as u32 & 0xff) + 16,
36 + (IMM8 as u32 & 0xff) + 16,
37 + (IMM8 as u32 & 0xff) + 16,
38 + (IMM8 as u32 & 0xff) + 16,
39 + (IMM8 as u32 & 0xff) + 16,
40 + (IMM8 as u32 & 0xff) + 16,
41 + (IMM8 as u32 & 0xff) + 16,
42 + (IMM8 as u32 & 0xff) + 16,
43 + (IMM8 as u32 & 0xff) + 16,
44 + (IMM8 as u32 & 0xff) + 16,
45 + (IMM8 as u32 & 0xff) + 16,
46 + (IMM8 as u32 & 0xff) + 16,
47 + (IMM8 as u32 & 0xff) + 16,
48 + (IMM8 as u32 & 0xff),
49 + (IMM8 as u32 & 0xff),
50 + (IMM8 as u32 & 0xff),
51 + (IMM8 as u32 & 0xff),
52 + (IMM8 as u32 & 0xff),
53 + (IMM8 as u32 & 0xff),
54 + (IMM8 as u32 & 0xff),
55 + (IMM8 as u32 & 0xff),
56 + (IMM8 as u32 & 0xff),
57 + (IMM8 as u32 & 0xff),
58 + (IMM8 as u32 & 0xff),
59 + (IMM8 as u32 & 0xff),
60 + (IMM8 as u32 & 0xff),
61 + (IMM8 as u32 & 0xff),
62 + (IMM8 as u32 & 0xff),
63 + (IMM8 as u32 & 0xff),
],
);
let r: i8x64 = match IMM8 % 16 {
0 => simd_shuffle64(
a,
zero,
[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
],
),
1 => simd_shuffle64(
a,
zero,
[
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31, 80, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112,
],
),
2 => simd_shuffle64(
a,
zero,
[
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 80, 81, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113,
],
),
3 => simd_shuffle64(
a,
zero,
[
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
46, 47, 96, 97, 98, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113,
114,
],
),
4 => simd_shuffle64(
a,
zero,
[
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 20, 21, 22, 23, 24, 25,
26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
47, 96, 97, 98, 99, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114,
115,
],
),
5 => simd_shuffle64(
a,
zero,
[
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
96, 97, 98, 99, 100, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114,
115, 116,
],
),
6 => simd_shuffle64(
a,
zero,
[
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 96,
97, 98, 99, 100, 101, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115,
116, 117,
],
),
7 => simd_shuffle64(
a,
zero,
[
7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 23, 24, 25, 26, 27,
28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 39, 40, 41, 42, 43, 44, 45, 46, 47, 96,
97, 98, 99, 100, 101, 102, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115,
116, 117, 118,
],
),
8 => simd_shuffle64(
a,
zero,
[
8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 24, 25, 26, 27, 28,
29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 40, 41, 42, 43, 44, 45, 46, 47, 96, 97,
98, 99, 100, 101, 102, 103, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115,
116, 117, 118, 119,
],
),
9 => simd_shuffle64(
a,
zero,
[
9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 25, 26, 27, 28, 29,
30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 41, 42, 43, 44, 45, 46, 47, 96, 97, 98,
99, 100, 101, 102, 103, 104, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116,
117, 118, 119, 120,
],
),
10 => simd_shuffle64(
a,
zero,
[
10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 26, 27, 28, 29, 30,
31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 42, 43, 44, 45, 46, 47, 96, 97, 98, 99,
100, 101, 102, 103, 104, 105, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116, 117,
118, 119, 120, 121,
],
),
11 => simd_shuffle64(
a,
zero,
[
11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 27, 28, 29, 30, 31,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 43, 44, 45, 46, 47, 96, 97, 98, 99,
100, 101, 102, 103, 104, 105, 106, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116,
117, 118, 119, 120, 121, 122,
],
),
12 => simd_shuffle64(
a,
zero,
[
12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 28, 29, 30, 31, 80,
81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 44, 45, 46, 47, 96, 97, 98, 99, 100,
101, 102, 103, 104, 105, 106, 107, 60, 61, 62, 63, 112, 113, 114, 115, 116, 117,
118, 119, 120, 121, 122, 123,
],
),
13 => simd_shuffle64(
a,
zero,
[
13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 29, 30, 31, 80, 81,
82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 45, 46, 47, 96, 97, 98, 99, 100, 101,
102, 103, 104, 105, 106, 107, 108, 61, 62, 63, 112, 113, 114, 115, 116, 117, 118,
119, 120, 121, 122, 123, 124,
],
),
14 => simd_shuffle64(
a,
zero,
[
14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 30, 31, 80, 81, 82,
83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 46, 47, 96, 97, 98, 99, 100, 101, 102,
103, 104, 105, 106, 107, 108, 109, 62, 63, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125,
],
),
15 => simd_shuffle64(
a,
zero,
[
15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 31, 80, 81, 82, 83,
84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 47, 96, 97, 98, 99, 100, 101, 102, 103,
104, 105, 106, 107, 108, 109, 110, 63, 112, 113, 114, 115, 116, 117, 118, 119, 120,
121, 122, 123, 124, 125, 126,
],
),
_ => zero,
};
transmute(r)
}
@ -17820,18 +17910,18 @@ mod tests {
unsafe fn test_mm512_bsrli_epi128() {
#[rustfmt::skip]
let a = _mm512_set_epi8(
1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
);
let r = _mm512_bsrli_epi128::<9>(a);
let r = _mm512_bsrli_epi128::<3>(a);
#[rustfmt::skip]
let e = _mm512_set_epi8(
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
0, 0, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
0, 0, 0, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
0, 0, 0, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
);
assert_eq_m512i(r, e);
}

View file

@ -4,7 +4,7 @@
use crate::{
core_arch::{simd::*, x86::*},
hint::unreachable_unchecked,
// hint::unreachable_unchecked,
mem::transmute,
};
@ -42,22 +42,6 @@ pub unsafe fn _mm256_cvtph_ps(a: __m128i) -> __m256 {
transmute(llvm_vcvtph2ps_256(transmute(a)))
}
macro_rules! dispatch_rounding {
($rounding:ident, $call:ident) => {{
match $rounding {
0 => call!(0),
1 => call!(1),
2 => call!(2),
3 => call!(3),
4 => call!(4),
5 => call!(5),
6 => call!(6),
7 => call!(7),
_ => unreachable_unchecked(),
}
}};
}
/// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x
/// 16-bit half-precision float values stored in the lowest 64-bit of a 128-bit
/// vector.
@ -71,16 +55,13 @@ macro_rules! dispatch_rounding {
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`].
#[inline]
#[target_feature(enable = "f16c")]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))]
pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i {
let a = transmute(a);
macro_rules! call {
($rounding:expr) => {
llvm_vcvtps2ph_128(a, $rounding)
};
}
transmute(dispatch_rounding!(imm_rounding, call))
#[cfg_attr(test, assert_instr("vcvtps2ph", IMM_ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_cvtps_ph<const IMM_ROUNDING: i32>(a: __m128) -> __m128i {
    // The rounding mode is a 3-bit immediate (0..=7); reject anything else
    // at compile time, then lower directly to the LLVM intrinsic.
    static_assert_imm3!(IMM_ROUNDING);
    transmute(llvm_vcvtps2ph_128(a.as_f32x4(), IMM_ROUNDING))
}
/// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x
@ -95,16 +76,13 @@ pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i {
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`].
#[inline]
#[target_feature(enable = "f16c")]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))]
pub unsafe fn _mm256_cvtps_ph(a: __m256, imm_rounding: i32) -> __m128i {
let a = transmute(a);
macro_rules! call {
($rounding:expr) => {
llvm_vcvtps2ph_256(a, $rounding)
};
}
transmute(dispatch_rounding!(imm_rounding, call))
#[cfg_attr(test, assert_instr("vcvtps2ph", IMM_ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_cvtps_ph<const IMM_ROUNDING: i32>(a: __m256) -> __m128i {
    // The rounding mode is a 3-bit immediate (0..=7); reject anything else
    // at compile time, then lower directly to the LLVM intrinsic.
    static_assert_imm3!(IMM_ROUNDING);
    transmute(llvm_vcvtps2ph_256(a.as_f32x8(), IMM_ROUNDING))
}
#[cfg(test)]
@ -116,7 +94,7 @@ mod tests {
unsafe fn test_mm_cvtph_ps() {
let array = [1_f32, 2_f32, 3_f32, 4_f32];
let float_vec: __m128 = transmute(array);
let halfs: __m128i = _mm_cvtps_ph(float_vec, 0);
let halfs: __m128i = _mm_cvtps_ph::<0>(float_vec);
let floats: __m128 = _mm_cvtph_ps(halfs);
let result: [f32; 4] = transmute(floats);
assert_eq!(result, array);
@ -126,7 +104,7 @@ mod tests {
unsafe fn test_mm256_cvtph_ps() {
let array = [1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32];
let float_vec: __m256 = transmute(array);
let halfs: __m128i = _mm256_cvtps_ph(float_vec, 0);
let halfs: __m128i = _mm256_cvtps_ph::<0>(float_vec);
let floats: __m256 = _mm256_cvtph_ps(halfs);
let result: [f32; 8] = transmute(floats);
assert_eq!(result, array);

View file

@ -81,291 +81,6 @@ macro_rules! static_assert_imm8_scale {
};
}
// For gather instructions, the only valid values for scale are 1, 2, 4 and 8.
// This macro enforces that.
#[allow(unused)]
macro_rules! constify_imm8_gather {
($imm8:expr, $expand:ident) => {
#[allow(overflowing_literals)]
match ($imm8) {
1 => $expand!(1),
2 => $expand!(2),
4 => $expand!(4),
8 => $expand!(8),
_ => panic!("Only 1, 2, 4, and 8 are valid values"),
}
};
}
// Constifies 8 bits along with an sae option without rounding control.
// The only valid values are 0 to 255.
// This macro enforces that.
#[allow(unused)]
macro_rules! constify_imm8_sae {
($imm8:expr, $expand:ident) => {
#[allow(overflowing_literals)]
match ($imm8) & 0b1111_1111 {
0 => $expand!(0),
1 => $expand!(1),
2 => $expand!(2),
3 => $expand!(3),
4 => $expand!(4),
5 => $expand!(5),
6 => $expand!(6),
7 => $expand!(7),
8 => $expand!(8),
9 => $expand!(9),
10 => $expand!(10),
11 => $expand!(11),
12 => $expand!(12),
13 => $expand!(13),
14 => $expand!(14),
15 => $expand!(15),
16 => $expand!(16),
17 => $expand!(17),
18 => $expand!(18),
19 => $expand!(19),
20 => $expand!(20),
21 => $expand!(21),
22 => $expand!(22),
23 => $expand!(23),
24 => $expand!(24),
25 => $expand!(25),
26 => $expand!(26),
27 => $expand!(27),
28 => $expand!(28),
29 => $expand!(29),
30 => $expand!(30),
31 => $expand!(31),
32 => $expand!(32),
33 => $expand!(33),
34 => $expand!(34),
35 => $expand!(35),
36 => $expand!(36),
37 => $expand!(37),
38 => $expand!(38),
39 => $expand!(39),
40 => $expand!(40),
41 => $expand!(41),
42 => $expand!(42),
43 => $expand!(43),
44 => $expand!(44),
45 => $expand!(45),
46 => $expand!(46),
47 => $expand!(47),
48 => $expand!(48),
49 => $expand!(49),
50 => $expand!(50),
51 => $expand!(51),
52 => $expand!(52),
53 => $expand!(53),
54 => $expand!(54),
55 => $expand!(55),
56 => $expand!(56),
57 => $expand!(57),
58 => $expand!(58),
59 => $expand!(59),
60 => $expand!(60),
61 => $expand!(61),
62 => $expand!(62),
63 => $expand!(63),
64 => $expand!(64),
65 => $expand!(65),
66 => $expand!(66),
67 => $expand!(67),
68 => $expand!(68),
69 => $expand!(69),
70 => $expand!(70),
71 => $expand!(71),
72 => $expand!(72),
73 => $expand!(73),
74 => $expand!(74),
75 => $expand!(75),
76 => $expand!(76),
77 => $expand!(77),
78 => $expand!(78),
79 => $expand!(79),
80 => $expand!(80),
81 => $expand!(81),
82 => $expand!(82),
83 => $expand!(83),
84 => $expand!(84),
85 => $expand!(85),
86 => $expand!(86),
87 => $expand!(87),
88 => $expand!(88),
89 => $expand!(89),
90 => $expand!(90),
91 => $expand!(91),
92 => $expand!(92),
93 => $expand!(93),
94 => $expand!(94),
95 => $expand!(95),
96 => $expand!(96),
97 => $expand!(97),
98 => $expand!(98),
99 => $expand!(99),
100 => $expand!(100),
101 => $expand!(101),
102 => $expand!(102),
103 => $expand!(103),
104 => $expand!(104),
105 => $expand!(105),
106 => $expand!(106),
107 => $expand!(107),
108 => $expand!(108),
109 => $expand!(109),
110 => $expand!(110),
111 => $expand!(111),
112 => $expand!(112),
113 => $expand!(113),
114 => $expand!(114),
115 => $expand!(115),
116 => $expand!(116),
117 => $expand!(117),
118 => $expand!(118),
119 => $expand!(119),
120 => $expand!(120),
121 => $expand!(121),
122 => $expand!(122),
123 => $expand!(123),
124 => $expand!(124),
125 => $expand!(125),
126 => $expand!(126),
127 => $expand!(127),
128 => $expand!(128),
129 => $expand!(129),
130 => $expand!(130),
131 => $expand!(131),
132 => $expand!(132),
133 => $expand!(133),
134 => $expand!(134),
135 => $expand!(135),
136 => $expand!(136),
137 => $expand!(137),
138 => $expand!(138),
139 => $expand!(139),
140 => $expand!(140),
141 => $expand!(141),
142 => $expand!(142),
143 => $expand!(143),
144 => $expand!(144),
145 => $expand!(145),
146 => $expand!(146),
147 => $expand!(147),
148 => $expand!(148),
149 => $expand!(149),
150 => $expand!(150),
151 => $expand!(151),
152 => $expand!(152),
153 => $expand!(153),
154 => $expand!(154),
155 => $expand!(155),
156 => $expand!(156),
157 => $expand!(157),
158 => $expand!(158),
159 => $expand!(159),
160 => $expand!(160),
161 => $expand!(161),
162 => $expand!(162),
163 => $expand!(163),
164 => $expand!(164),
165 => $expand!(165),
166 => $expand!(166),
167 => $expand!(167),
168 => $expand!(168),
169 => $expand!(169),
170 => $expand!(170),
171 => $expand!(171),
172 => $expand!(172),
173 => $expand!(173),
174 => $expand!(174),
175 => $expand!(175),
176 => $expand!(176),
177 => $expand!(177),
178 => $expand!(178),
179 => $expand!(179),
180 => $expand!(180),
181 => $expand!(181),
182 => $expand!(182),
183 => $expand!(183),
184 => $expand!(184),
185 => $expand!(185),
186 => $expand!(186),
187 => $expand!(187),
188 => $expand!(188),
189 => $expand!(189),
190 => $expand!(190),
191 => $expand!(191),
192 => $expand!(192),
193 => $expand!(193),
194 => $expand!(194),
195 => $expand!(195),
196 => $expand!(196),
197 => $expand!(197),
198 => $expand!(198),
199 => $expand!(199),
200 => $expand!(200),
201 => $expand!(201),
202 => $expand!(202),
203 => $expand!(203),
204 => $expand!(204),
205 => $expand!(205),
206 => $expand!(206),
207 => $expand!(207),
208 => $expand!(208),
209 => $expand!(209),
210 => $expand!(210),
211 => $expand!(211),
212 => $expand!(212),
213 => $expand!(213),
214 => $expand!(214),
215 => $expand!(215),
216 => $expand!(216),
217 => $expand!(217),
218 => $expand!(218),
219 => $expand!(219),
220 => $expand!(220),
221 => $expand!(221),
222 => $expand!(222),
223 => $expand!(223),
224 => $expand!(224),
225 => $expand!(225),
226 => $expand!(226),
227 => $expand!(227),
228 => $expand!(228),
229 => $expand!(229),
230 => $expand!(230),
231 => $expand!(231),
232 => $expand!(232),
233 => $expand!(233),
234 => $expand!(234),
235 => $expand!(235),
236 => $expand!(236),
237 => $expand!(237),
238 => $expand!(238),
239 => $expand!(239),
240 => $expand!(240),
241 => $expand!(241),
242 => $expand!(242),
243 => $expand!(243),
244 => $expand!(244),
245 => $expand!(245),
246 => $expand!(246),
247 => $expand!(247),
248 => $expand!(248),
249 => $expand!(249),
250 => $expand!(250),
251 => $expand!(251),
252 => $expand!(252),
253 => $expand!(253),
254 => $expand!(254),
255 => $expand!(255),
_ => panic!("Invalid sae value"),
}
};
}
#[cfg(test)]
macro_rules! assert_approx_eq {
($a:expr, $b:expr, $eps:expr) => {{