[ci] check formatting (#64)
* [ci] check formatting * [rustfmt] reformat the whole library
This commit is contained in:
parent
5869eca3e9
commit
69d2ad85f3
35 changed files with 2207 additions and 1405 deletions
|
|
@ -17,9 +17,21 @@ matrix:
|
|||
script: ci/run.sh
|
||||
- install: true
|
||||
script: ci/dox.sh
|
||||
- env: RUSTFMT=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1
|
||||
script: |
|
||||
cargo install rustfmt-nightly
|
||||
cargo fmt -- --write-mode=diff
|
||||
cd stdsimd
|
||||
cargo fmt -- --write-mode=diff
|
||||
cd assert-instr-macro
|
||||
cargo fmt -- --write-mode=diff
|
||||
cd ../simd-test-macro
|
||||
cargo fmt -- --write-mode=diff
|
||||
allow_failures:
|
||||
- env: RUSTFMT=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1
|
||||
|
||||
install:
|
||||
- if [ "$NO_ADD" = "" ]; then rustup target add $TARGET; fi
|
||||
- if [ "$NO_ADD" == "" ]; then rustup target add $TARGET; fi
|
||||
|
||||
script:
|
||||
- cargo generate-lockfile
|
||||
|
|
|
|||
|
|
@ -26,7 +26,8 @@ impl Frsqrt for f64x2 {
|
|||
|
||||
let u = unsafe {
|
||||
vendor::_mm_rsqrt_ps(
|
||||
f32x4::new(t.extract(0), t.extract(1), 0., 0.)).as_f64x4()
|
||||
f32x4::new(t.extract(0), t.extract(1), 0., 0.),
|
||||
).as_f64x4()
|
||||
};
|
||||
f64x2::new(u.extract(0), u.extract(1))
|
||||
}
|
||||
|
|
@ -36,11 +37,12 @@ impl Frsqrt for f64x2 {
|
|||
use self::stdsimd::vendor;
|
||||
unsafe { vendor::vrsqrte_f32(self.as_f32x2()).as_f64x2() }
|
||||
}
|
||||
#[cfg(not(any(all(any(target_arch = "x86", target_arch = "x86_64"),
|
||||
#[cfg(not(any(all(any(target_arch = "x86",
|
||||
target_arch = "x86_64"),
|
||||
target_feature = "sse"),
|
||||
all(any(target_arch = "arm", target_arch = "aarch64"),
|
||||
target_feature = "neon")
|
||||
)))]
|
||||
all(any(target_arch = "arm",
|
||||
target_arch = "aarch64"),
|
||||
target_feature = "neon"))))]
|
||||
{
|
||||
self.replace(0, 1. / self.extract(0).sqrt());
|
||||
self.replace(1, 1. / self.extract(1).sqrt());
|
||||
|
|
@ -57,9 +59,9 @@ struct Body {
|
|||
}
|
||||
|
||||
impl Body {
|
||||
fn new(x0: f64, x1: f64, x2: f64,
|
||||
v0: f64, v1: f64, v2: f64,
|
||||
mass: f64) -> Body {
|
||||
fn new(
|
||||
x0: f64, x1: f64, x2: f64, v0: f64, v1: f64, v2: f64, mass: f64
|
||||
) -> Body {
|
||||
Body {
|
||||
x: [x0, x1, x2],
|
||||
_fill: 0.0,
|
||||
|
|
@ -91,7 +93,7 @@ fn advance(bodies: &mut [Body; N_BODIES], dt: f64) {
|
|||
|
||||
let mut i = 0;
|
||||
for j in 0..N_BODIES {
|
||||
for k in j+1..N_BODIES {
|
||||
for k in j + 1..N_BODIES {
|
||||
for m in 0..3 {
|
||||
r[i][m] = bodies[j].x[m] - bodies[k].x[m];
|
||||
}
|
||||
|
|
@ -102,14 +104,15 @@ fn advance(bodies: &mut [Body; N_BODIES], dt: f64) {
|
|||
i = 0;
|
||||
while i < N {
|
||||
for m in 0..3 {
|
||||
dx[m] = f64x2::new(r[i][m], r[i+1][m]);
|
||||
dx[m] = f64x2::new(r[i][m], r[i + 1][m]);
|
||||
}
|
||||
|
||||
dsquared = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
|
||||
distance = dsquared.frsqrt();
|
||||
for _ in 0..2 {
|
||||
distance = distance * f64x2::splat(1.5) -
|
||||
((f64x2::splat(0.5) * dsquared) * distance) * (distance * distance)
|
||||
distance = distance * f64x2::splat(1.5)
|
||||
- ((f64x2::splat(0.5) * dsquared) * distance)
|
||||
* (distance * distance)
|
||||
}
|
||||
dmag = f64x2::splat(dt) / dsquared * distance;
|
||||
dmag.store(&mut mag, i);
|
||||
|
|
@ -119,7 +122,7 @@ fn advance(bodies: &mut [Body; N_BODIES], dt: f64) {
|
|||
|
||||
i = 0;
|
||||
for j in 0..N_BODIES {
|
||||
for k in j+1..N_BODIES {
|
||||
for k in j + 1..N_BODIES {
|
||||
for m in 0..3 {
|
||||
bodies[j].v[m] -= r[i][m] * bodies[k].mass * mag[i];
|
||||
bodies[k].v[m] += r[i][m] * bodies[j].mass * mag[i];
|
||||
|
|
@ -138,15 +141,19 @@ fn energy(bodies: &[Body; N_BODIES]) -> f64 {
|
|||
let mut e = 0.0;
|
||||
for i in 0..N_BODIES {
|
||||
let bi = &bodies[i];
|
||||
e += bi.mass * (bi.v[0] * bi.v[0] + bi.v[1] * bi.v[1] + bi.v[2] * bi.v[2]) / 2.0;
|
||||
for j in i+1..N_BODIES {
|
||||
e += bi.mass
|
||||
* (bi.v[0] * bi.v[0] + bi.v[1] * bi.v[1] + bi.v[2] * bi.v[2])
|
||||
/ 2.0;
|
||||
for j in i + 1..N_BODIES {
|
||||
let bj = &bodies[j];
|
||||
let mut dx = [0.0; 3];
|
||||
for k in 0..3 {
|
||||
dx[k] = bi.x[k] - bj.x[k];
|
||||
}
|
||||
let mut distance = 0.0;
|
||||
for &d in &dx { distance += d * d }
|
||||
for &d in &dx {
|
||||
distance += d * d
|
||||
}
|
||||
e -= bi.mass * bj.mass / distance.sqrt()
|
||||
}
|
||||
}
|
||||
|
|
@ -156,48 +163,54 @@ fn energy(bodies: &[Body; N_BODIES]) -> f64 {
|
|||
fn main() {
|
||||
let mut bodies: [Body; N_BODIES] = [
|
||||
/* sun */
|
||||
Body::new(0.0, 0.0, 0.0,
|
||||
0.0, 0.0, 0.0,
|
||||
SOLAR_MASS),
|
||||
Body::new(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, SOLAR_MASS),
|
||||
/* jupiter */
|
||||
Body::new(4.84143144246472090e+00,
|
||||
-1.16032004402742839e+00,
|
||||
-1.03622044471123109e-01 ,
|
||||
1.66007664274403694e-03 * DAYS_PER_YEAR,
|
||||
7.69901118419740425e-03 * DAYS_PER_YEAR,
|
||||
-6.90460016972063023e-05 * DAYS_PER_YEAR ,
|
||||
9.54791938424326609e-04 * SOLAR_MASS
|
||||
),
|
||||
Body::new(
|
||||
4.84143144246472090e+00,
|
||||
-1.16032004402742839e+00,
|
||||
-1.03622044471123109e-01,
|
||||
1.66007664274403694e-03 * DAYS_PER_YEAR,
|
||||
7.69901118419740425e-03 * DAYS_PER_YEAR,
|
||||
-6.90460016972063023e-05 * DAYS_PER_YEAR,
|
||||
9.54791938424326609e-04 * SOLAR_MASS,
|
||||
),
|
||||
/* saturn */
|
||||
Body::new(8.34336671824457987e+00,
|
||||
4.12479856412430479e+00,
|
||||
-4.03523417114321381e-01 ,
|
||||
-2.76742510726862411e-03 * DAYS_PER_YEAR,
|
||||
4.99852801234917238e-03 * DAYS_PER_YEAR,
|
||||
2.30417297573763929e-05 * DAYS_PER_YEAR ,
|
||||
2.85885980666130812e-04 * SOLAR_MASS
|
||||
),
|
||||
Body::new(
|
||||
8.34336671824457987e+00,
|
||||
4.12479856412430479e+00,
|
||||
-4.03523417114321381e-01,
|
||||
-2.76742510726862411e-03 * DAYS_PER_YEAR,
|
||||
4.99852801234917238e-03 * DAYS_PER_YEAR,
|
||||
2.30417297573763929e-05 * DAYS_PER_YEAR,
|
||||
2.85885980666130812e-04 * SOLAR_MASS,
|
||||
),
|
||||
/* uranus */
|
||||
Body::new(1.28943695621391310e+01,
|
||||
-1.51111514016986312e+01,
|
||||
-2.23307578892655734e-01 ,
|
||||
2.96460137564761618e-03 * DAYS_PER_YEAR,
|
||||
2.37847173959480950e-03 * DAYS_PER_YEAR,
|
||||
-2.96589568540237556e-05 * DAYS_PER_YEAR ,
|
||||
4.36624404335156298e-05 * SOLAR_MASS
|
||||
),
|
||||
Body::new(
|
||||
1.28943695621391310e+01,
|
||||
-1.51111514016986312e+01,
|
||||
-2.23307578892655734e-01,
|
||||
2.96460137564761618e-03 * DAYS_PER_YEAR,
|
||||
2.37847173959480950e-03 * DAYS_PER_YEAR,
|
||||
-2.96589568540237556e-05 * DAYS_PER_YEAR,
|
||||
4.36624404335156298e-05 * SOLAR_MASS,
|
||||
),
|
||||
/* neptune */
|
||||
Body::new(1.53796971148509165e+01,
|
||||
-2.59193146099879641e+01,
|
||||
1.79258772950371181e-01 ,
|
||||
2.68067772490389322e-03 * DAYS_PER_YEAR,
|
||||
1.62824170038242295e-03 * DAYS_PER_YEAR,
|
||||
-9.51592254519715870e-05 * DAYS_PER_YEAR ,
|
||||
5.15138902046611451e-05 * SOLAR_MASS
|
||||
)
|
||||
];
|
||||
Body::new(
|
||||
1.53796971148509165e+01,
|
||||
-2.59193146099879641e+01,
|
||||
1.79258772950371181e-01,
|
||||
2.68067772490389322e-03 * DAYS_PER_YEAR,
|
||||
1.62824170038242295e-03 * DAYS_PER_YEAR,
|
||||
-9.51592254519715870e-05 * DAYS_PER_YEAR,
|
||||
5.15138902046611451e-05 * SOLAR_MASS,
|
||||
),
|
||||
];
|
||||
|
||||
let n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap();
|
||||
let n: usize = std::env::args()
|
||||
.nth(1)
|
||||
.expect("need one arg")
|
||||
.parse()
|
||||
.unwrap();
|
||||
|
||||
offset_momentum(&mut bodies);
|
||||
println!("{:.9}", energy(&bodies));
|
||||
|
|
|
|||
|
|
@ -27,8 +27,12 @@ mod example {
|
|||
|
||||
unsafe {
|
||||
vendor::_mm_cmpestri(
|
||||
vneedle, needle_len as i32, vhaystack, hay_len as i32,
|
||||
vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
|
||||
vneedle,
|
||||
needle_len as i32,
|
||||
vhaystack,
|
||||
hay_len as i32,
|
||||
vendor::_SIDD_CMP_EQUAL_ORDERED,
|
||||
) as usize
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
5
library/stdarch/rustfmt.toml
Normal file
5
library/stdarch/rustfmt.toml
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
max_width = 79
|
||||
fn_call_width = 79
|
||||
wrap_comments = true
|
||||
error_on_line_overflow = false
|
||||
fn_args_density = "Compressed"
|
||||
|
|
@ -3,7 +3,9 @@
|
|||
//! The reference for NEON is [ARM's NEON Intrinsics Reference][arm_ref]. The
|
||||
//! [ARM's NEON Intrinsics Online Database][arm_dat] is also useful.
|
||||
//!
|
||||
//! [arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
|
||||
//! [arm_ref]:
|
||||
//! http://infocenter.arm.com/help/topic/com.arm.doc.
|
||||
//! ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
|
||||
//! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics
|
||||
|
||||
pub use self::v6::*;
|
||||
|
|
|
|||
|
|
@ -1,7 +1,10 @@
|
|||
//! ARMv6 intrinsics.
|
||||
//!
|
||||
//! The reference is [ARMv6-M Architecture Reference
|
||||
//! Manual](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index.html).
|
||||
//! The reference is [ARMv6-M Architecture Reference Manual][armv6m].
|
||||
//!
|
||||
//! [armv6m]:
|
||||
//! http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index.
|
||||
//! html
|
||||
|
||||
#[cfg(test)]
|
||||
use stdsimd_test::assert_instr;
|
||||
|
|
@ -27,16 +30,20 @@ mod tests {
|
|||
#[test]
|
||||
fn _rev_u16() {
|
||||
unsafe {
|
||||
assert_eq!(v6::_rev_u16(0b0000_0000_1111_1111_u16), 0b1111_1111_0000_0000_u16);
|
||||
assert_eq!(
|
||||
v6::_rev_u16(0b0000_0000_1111_1111_u16),
|
||||
0b1111_1111_0000_0000_u16
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn _rev_u32() {
|
||||
unsafe {
|
||||
assert_eq!(v6::_rev_u32(
|
||||
0b0000_0000_1111_1111_0000_0000_1111_1111_u32
|
||||
), 0b1111_1111_0000_0000_1111_1111_0000_0000_u32);
|
||||
assert_eq!(
|
||||
v6::_rev_u32(0b0000_0000_1111_1111_0000_0000_1111_1111_u32),
|
||||
0b1111_1111_0000_0000_1111_1111_0000_0000_u32
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,11 @@
|
|||
//! ARMv7 intrinsics.
|
||||
//!
|
||||
//! The reference is [ARMv7-M Architecture Reference Manual (Issue
|
||||
//! E.b)](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0403e.b/index.html).
|
||||
//! E.b)][armv7m].
|
||||
//!
|
||||
//! [armv7m]:
|
||||
//! http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0403e.
|
||||
//! b/index.html
|
||||
|
||||
pub use super::v6::*;
|
||||
|
||||
|
|
@ -39,7 +43,7 @@ pub unsafe fn _rbit_u32(x: u32) -> u32 {
|
|||
|
||||
#[allow(dead_code)]
|
||||
extern "C" {
|
||||
#[link_name="llvm.bitreverse.i32"]
|
||||
#[link_name = "llvm.bitreverse.i32"]
|
||||
fn rbit_u32(i: i32) -> i32;
|
||||
}
|
||||
|
||||
|
|
@ -72,8 +76,10 @@ mod tests {
|
|||
#[test]
|
||||
fn _rbit_u32() {
|
||||
unsafe {
|
||||
assert_eq!(v7::_rbit_u32(0b0000_1010u32),
|
||||
0b0101_0000_0000_0000_0000_0000_0000_0000u32);
|
||||
assert_eq!(
|
||||
v7::_rbit_u32(0b0000_1010u32),
|
||||
0b0101_0000_0000_0000_0000_0000_0000_0000u32
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,10 +5,8 @@ use stdsimd_test::assert_instr;
|
|||
|
||||
use simd_llvm::simd_add;
|
||||
|
||||
use v64::{i8x8, i16x4, i32x2,
|
||||
u8x8, u16x4, u32x2, f32x2};
|
||||
use v128::{i8x16, i16x8, i32x4, i64x2,
|
||||
u8x16, u16x8, u32x4, u64x2, f32x4};
|
||||
use v64::{f32x2, i16x4, i32x2, i8x8, u16x4, u32x2, u8x8};
|
||||
use v128::{f32x4, i16x8, i32x4, i64x2, i8x16, u16x8, u32x4, u64x2, u8x16};
|
||||
|
||||
/// Vector add.
|
||||
#[inline(always)]
|
||||
|
|
@ -230,18 +228,9 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn vaddq_s8_() {
|
||||
let a = i8x16::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
);
|
||||
let b = i8x16::new(
|
||||
8, 7, 6, 5, 4, 3, 2, 1,
|
||||
8, 7, 6, 5, 4, 3, 2, 1,
|
||||
);
|
||||
let e = i8x16::new(
|
||||
9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9,
|
||||
);
|
||||
let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let b = i8x16::new(8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1);
|
||||
let e = i8x16::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9);
|
||||
let r = unsafe { vaddq_s8(a, b) };
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -293,18 +282,9 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn vaddq_u8_() {
|
||||
let a = u8x16::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
);
|
||||
let b = u8x16::new(
|
||||
8, 7, 6, 5, 4, 3, 2, 1,
|
||||
8, 7, 6, 5, 4, 3, 2, 1,
|
||||
);
|
||||
let e = u8x16::new(
|
||||
9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9,
|
||||
);
|
||||
let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let b = u8x16::new(8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1);
|
||||
let e = u8x16::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9);
|
||||
let r = unsafe { vaddq_u8(a, b) };
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -366,15 +346,9 @@ mod tests {
|
|||
#[test]
|
||||
fn vaddl_s8_() {
|
||||
let v = ::std::i8::MAX;
|
||||
let a = i8x8::new(
|
||||
v, v, v, v,
|
||||
v, v, v, v,
|
||||
);
|
||||
let a = i8x8::new(v, v, v, v, v, v, v, v);
|
||||
let v = 2 * (v as i16);
|
||||
let e = i16x8::new(
|
||||
v, v, v, v,
|
||||
v, v, v, v,
|
||||
);
|
||||
let e = i16x8::new(v, v, v, v, v, v, v, v);
|
||||
let r = unsafe { vaddl_s8(a, a) };
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -382,13 +356,9 @@ mod tests {
|
|||
#[test]
|
||||
fn vaddl_s16_() {
|
||||
let v = ::std::i16::MAX;
|
||||
let a = i16x4::new(
|
||||
v, v, v, v,
|
||||
);
|
||||
let a = i16x4::new(v, v, v, v);
|
||||
let v = 2 * (v as i32);
|
||||
let e = i32x4::new(
|
||||
v, v, v, v,
|
||||
);
|
||||
let e = i32x4::new(v, v, v, v);
|
||||
let r = unsafe { vaddl_s16(a, a) };
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -396,13 +366,9 @@ mod tests {
|
|||
#[test]
|
||||
fn vaddl_s32_() {
|
||||
let v = ::std::i32::MAX;
|
||||
let a = i32x2::new(
|
||||
v, v,
|
||||
);
|
||||
let a = i32x2::new(v, v);
|
||||
let v = 2 * (v as i64);
|
||||
let e = i64x2::new(
|
||||
v, v,
|
||||
);
|
||||
let e = i64x2::new(v, v);
|
||||
let r = unsafe { vaddl_s32(a, a) };
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -410,15 +376,9 @@ mod tests {
|
|||
#[test]
|
||||
fn vaddl_u8_() {
|
||||
let v = ::std::u8::MAX;
|
||||
let a = u8x8::new(
|
||||
v, v, v, v,
|
||||
v, v, v, v,
|
||||
);
|
||||
let a = u8x8::new(v, v, v, v, v, v, v, v);
|
||||
let v = 2 * (v as u16);
|
||||
let e = u16x8::new(
|
||||
v, v, v, v,
|
||||
v, v, v, v,
|
||||
);
|
||||
let e = u16x8::new(v, v, v, v, v, v, v, v);
|
||||
let r = unsafe { vaddl_u8(a, a) };
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -426,13 +386,9 @@ mod tests {
|
|||
#[test]
|
||||
fn vaddl_u16_() {
|
||||
let v = ::std::u16::MAX;
|
||||
let a = u16x4::new(
|
||||
v, v, v, v,
|
||||
);
|
||||
let a = u16x4::new(v, v, v, v);
|
||||
let v = 2 * (v as u32);
|
||||
let e = u32x4::new(
|
||||
v, v, v, v,
|
||||
);
|
||||
let e = u32x4::new(v, v, v, v);
|
||||
let r = unsafe { vaddl_u16(a, a) };
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -440,13 +396,9 @@ mod tests {
|
|||
#[test]
|
||||
fn vaddl_u32_() {
|
||||
let v = ::std::u32::MAX;
|
||||
let a = u32x2::new(
|
||||
v, v,
|
||||
);
|
||||
let a = u32x2::new(v, v);
|
||||
let v = 2 * (v as u64);
|
||||
let e = u64x2::new(
|
||||
v, v,
|
||||
);
|
||||
let e = u64x2::new(v, v);
|
||||
let r = unsafe { vaddl_u32(a, a) };
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
//! ARMv8 intrinsics.
|
||||
//!
|
||||
//! The reference is [ARMv8-A Reference Manual](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0487a.k_10775/index.html).
|
||||
//! The reference is [ARMv8-A Reference Manual][armv8].
|
||||
//!
|
||||
//! [armv8]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.
|
||||
//! ddi0487a.k_10775/index.html
|
||||
|
||||
pub use super::v7::*;
|
||||
|
||||
|
|
@ -23,7 +26,7 @@ pub unsafe fn _clz_u64(x: u64) -> u64 {
|
|||
|
||||
#[allow(dead_code)]
|
||||
extern "C" {
|
||||
#[link_name="llvm.bitreverse.i64"]
|
||||
#[link_name = "llvm.bitreverse.i64"]
|
||||
fn rbit_u64(i: i64) -> i64;
|
||||
}
|
||||
|
||||
|
|
@ -61,9 +64,10 @@ mod tests {
|
|||
#[test]
|
||||
fn _rev_u64() {
|
||||
unsafe {
|
||||
assert_eq!(v8::_rev_u64(
|
||||
0b0000_0000_1111_1111_0000_0000_1111_1111_u64
|
||||
), 0b1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64);
|
||||
assert_eq!(
|
||||
v8::_rev_u64(0b0000_0000_1111_1111_0000_0000_1111_1111_u64),
|
||||
0b1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -77,27 +81,32 @@ mod tests {
|
|||
#[test]
|
||||
fn _rbit_u64() {
|
||||
unsafe {
|
||||
assert_eq!(v8::_rbit_u64(
|
||||
0b0000_0000_1111_1101_0000_0000_1111_1111_u64
|
||||
), 0b1111_1111_0000_0000_1011_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64);
|
||||
assert_eq!(
|
||||
v8::_rbit_u64(0b0000_0000_1111_1101_0000_0000_1111_1111_u64),
|
||||
0b1111_1111_0000_0000_1011_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn _cls_u32() {
|
||||
unsafe {
|
||||
assert_eq!(v8::_cls_u32(
|
||||
0b1111_1111_1111_1111_0000_0000_1111_1111_u32
|
||||
), 15_u32);
|
||||
assert_eq!(
|
||||
v8::_cls_u32(0b1111_1111_1111_1111_0000_0000_1111_1111_u32),
|
||||
15_u32
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn _cls_u64() {
|
||||
unsafe {
|
||||
assert_eq!(v8::_cls_u64(
|
||||
0b1111_1111_1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_u64
|
||||
), 15_u64);
|
||||
assert_eq!(
|
||||
v8::_cls_u64(
|
||||
0b1111_1111_1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_u64
|
||||
),
|
||||
15_u64
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -44,9 +44,9 @@
|
|||
//! have no runtime support for whether you CPU actually supports the
|
||||
//! instruction.
|
||||
//!
|
||||
//! CPU target feature detection is done via the `cfg_feature_enabled!` macro at
|
||||
//! runtime. This macro will detect at runtime whether the specified feature is
|
||||
//! available or not, returning true or false depending on the current CPU.
|
||||
//! CPU target feature detection is done via the `cfg_feature_enabled!` macro
|
||||
//! at runtime. This macro will detect at runtime whether the specified feature
|
||||
//! is available or not, returning true or false depending on the current CPU.
|
||||
//!
|
||||
//! ```
|
||||
//! #![feature(cfg_target_feature)]
|
||||
|
|
@ -58,7 +58,8 @@
|
|||
//! if cfg_feature_enabled!("avx2") {
|
||||
//! println!("avx2 intrinsics will work");
|
||||
//! } else {
|
||||
//! println!("avx2 intrinsics will not work, they may generate SIGILL");
|
||||
//! println!("avx2 intrinsics will not work");
|
||||
//! // undefined behavior: may generate a `SIGILL`.
|
||||
//! }
|
||||
//! }
|
||||
//! ```
|
||||
|
|
@ -93,29 +94,33 @@
|
|||
//!
|
||||
//! # Status
|
||||
//!
|
||||
//! This crate is intended for eventual inclusion into the standard library, but
|
||||
//! some work and experimentation is needed to get there! First and foremost you
|
||||
//! can help out by kicking the tires on this crate and seeing if it works for
|
||||
//! your use case! Next up you can help us fill out the [vendor
|
||||
//! intrinsics][vendor] to ensure that we've got all the SIMD support necessary.
|
||||
//! This crate is intended for eventual inclusion into the standard library,
|
||||
//! but some work and experimentation is needed to get there! First and
|
||||
//! foremost you can help out by kicking the tires on this crate and seeing if
|
||||
//! it works for your use case! Next up you can help us fill out the [vendor
|
||||
//! intrinsics][vendor] to ensure that we've got all the SIMD support
|
||||
//! necessary.
|
||||
//!
|
||||
//! The language support and status of SIMD is also still a little up in the air
|
||||
//! right now, you may be interested in a few issues along these lines:
|
||||
//! The language support and status of SIMD is also still a little up in the
|
||||
//! air right now, you may be interested in a few issues along these lines:
|
||||
//!
|
||||
//! * [Overal tracking issue for SIMD support](https://github.com/rust-lang/rust/issues/27731)
|
||||
//! * [`cfg_target_feature` tracking issue](https://github.com/rust-lang/rust/issues/29717)
|
||||
//! * [SIMD types currently not sound](https://github.com/rust-lang/rust/issues/44367)
|
||||
//! * [`#[target_feature]` improvements](https://github.com/rust-lang/rust/issues/44839)
|
||||
//! * [Overal tracking issue for SIMD support]
|
||||
//! (https://github.com/rust-lang/rust/issues/27731)
|
||||
//! * [`cfg_target_feature` tracking issue]
|
||||
//! (https://github.com/rust-lang/rust/issues/29717)
|
||||
//! * [SIMD types currently not sound]
|
||||
//! (https://github.com/rust-lang/rust/issues/44367)
|
||||
//! * [`#[target_feature]` improvements]
|
||||
//! (https://github.com/rust-lang/rust/issues/44839)
|
||||
//!
|
||||
//! [vendor]: https://github.com/rust-lang-nursery/stdsimd/issues/40
|
||||
|
||||
#![cfg_attr(feature = "strict", deny(warnings))]
|
||||
#![allow(dead_code)]
|
||||
#![allow(unused_features)]
|
||||
#![feature(
|
||||
const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi,
|
||||
target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new
|
||||
)]
|
||||
#![feature(const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd,
|
||||
simd_ffi, target_feature, cfg_target_feature, i128_type, asm,
|
||||
const_atomic_usize_new, stmt_expr_attributes)]
|
||||
#![cfg_attr(test, feature(proc_macro, test))]
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
|
|||
|
|
@ -240,9 +240,11 @@ macro_rules! define_integer_ops {
|
|||
i8, i16, i32, i64, isize);
|
||||
|
||||
impl ::std::fmt::LowerHex for $ty {
|
||||
fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
|
||||
fn fmt(&self, f: &mut ::std::fmt::Formatter)
|
||||
-> ::std::fmt::Result {
|
||||
write!(f, "{}(", stringify!($ty))?;
|
||||
let n = ::std::mem::size_of_val(self) / ::std::mem::size_of::<$elem>();
|
||||
let n = ::std::mem::size_of_val(self)
|
||||
/ ::std::mem::size_of::<$elem>();
|
||||
for i in 0..n {
|
||||
if i > 0 {
|
||||
write!(f, ", ")?;
|
||||
|
|
@ -292,8 +294,7 @@ macro_rules! cfg_feature_enabled {
|
|||
/// On ARM features are only detected at compile-time using
|
||||
/// cfg(target_feature), so if this macro is executed the
|
||||
/// feature is not supported.
|
||||
#[cfg(any(target_arch = "arm",
|
||||
target_arch = "aarch64"))]
|
||||
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
|
||||
#[macro_export]
|
||||
#[doc(hidden)]
|
||||
macro_rules! __unstable_detect_feature {
|
||||
|
|
@ -302,10 +303,8 @@ macro_rules! __unstable_detect_feature {
|
|||
}
|
||||
|
||||
/// In all unsupported architectures using the macro is an error
|
||||
#[cfg(not(any(target_arch = "x86",
|
||||
target_arch = "x86_64",
|
||||
target_arch = "arm",
|
||||
target_arch = "aarch64")))]
|
||||
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64",
|
||||
target_arch = "arm", target_arch = "aarch64")))]
|
||||
#[macro_export]
|
||||
#[doc(hidden)]
|
||||
macro_rules! __unstable_detect_feature {
|
||||
|
|
|
|||
|
|
@ -50,7 +50,17 @@ define_from!(u8x16, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, i8x16);
|
|||
define_from!(i8x16, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16);
|
||||
|
||||
define_common_ops!(
|
||||
f64x2, f32x4, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16, i8x16);
|
||||
f64x2,
|
||||
f32x4,
|
||||
u64x2,
|
||||
i64x2,
|
||||
u32x4,
|
||||
i32x4,
|
||||
u16x8,
|
||||
i16x8,
|
||||
u8x16,
|
||||
i8x16
|
||||
);
|
||||
define_float_ops!(f64x2, f32x4);
|
||||
define_integer_ops!(
|
||||
(u64x2, u64),
|
||||
|
|
@ -60,7 +70,8 @@ define_integer_ops!(
|
|||
(u16x8, u16),
|
||||
(i16x8, i16),
|
||||
(u8x16, u8),
|
||||
(i8x16, i8));
|
||||
(i8x16, i8)
|
||||
);
|
||||
define_casts!(
|
||||
(f64x2, f32x2, as_f32x2),
|
||||
(f64x2, u64x2, as_u64x2),
|
||||
|
|
@ -79,4 +90,5 @@ define_casts!(
|
|||
(u16x8, i16x8, as_i16x8),
|
||||
(i16x8, u16x8, as_u16x8),
|
||||
(u8x16, i8x16, as_i8x16),
|
||||
(i8x16, u8x16, as_u8x16));
|
||||
(i8x16, u8x16, as_u8x16)
|
||||
);
|
||||
|
|
|
|||
|
|
@ -74,7 +74,17 @@ define_from!(u8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, i8x32);
|
|||
define_from!(i8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32);
|
||||
|
||||
define_common_ops!(
|
||||
f64x4, f32x8, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32);
|
||||
f64x4,
|
||||
f32x8,
|
||||
u64x4,
|
||||
i64x4,
|
||||
u32x8,
|
||||
i32x8,
|
||||
u16x16,
|
||||
i16x16,
|
||||
u8x32,
|
||||
i8x32
|
||||
);
|
||||
define_float_ops!(f64x4, f32x8);
|
||||
define_integer_ops!(
|
||||
(u64x4, u64),
|
||||
|
|
@ -84,7 +94,8 @@ define_integer_ops!(
|
|||
(u16x16, u16),
|
||||
(i16x16, i16),
|
||||
(u8x32, u8),
|
||||
(i8x32, i8));
|
||||
(i8x32, i8)
|
||||
);
|
||||
define_casts!(
|
||||
(f64x4, f32x4, as_f32x4),
|
||||
(f64x4, u64x4, as_u64x4),
|
||||
|
|
@ -102,4 +113,5 @@ define_casts!(
|
|||
(u16x16, i16x16, as_i16x16),
|
||||
(i16x16, u16x16, as_u16x16),
|
||||
(u8x32, i8x32, as_i8x32),
|
||||
(i8x32, u8x32, as_u8x32));
|
||||
(i8x32, u8x32, as_u8x32)
|
||||
);
|
||||
|
|
|
|||
|
|
@ -120,7 +120,17 @@ define_from!(u8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, i8x64);
|
|||
define_from!(i8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64);
|
||||
|
||||
define_common_ops!(
|
||||
f64x8, f32x16, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64);
|
||||
f64x8,
|
||||
f32x16,
|
||||
u64x8,
|
||||
i64x8,
|
||||
u32x16,
|
||||
i32x16,
|
||||
u16x32,
|
||||
i16x32,
|
||||
u8x64,
|
||||
i8x64
|
||||
);
|
||||
define_float_ops!(f64x8, f32x16);
|
||||
define_integer_ops!(
|
||||
(u64x8, u64),
|
||||
|
|
@ -130,7 +140,8 @@ define_integer_ops!(
|
|||
(u16x32, u16),
|
||||
(i16x32, i16),
|
||||
(u8x64, u8),
|
||||
(i8x64, i8));
|
||||
(i8x64, i8)
|
||||
);
|
||||
define_casts!(
|
||||
(f64x8, f32x8, as_f32x8),
|
||||
(f64x8, u64x8, as_u64x8),
|
||||
|
|
@ -148,5 +159,5 @@ define_casts!(
|
|||
(u16x32, i16x32, as_i16x32),
|
||||
(i16x32, u16x32, as_u16x32),
|
||||
(u8x64, i8x64, as_i8x64),
|
||||
(i8x64, u8x64, as_u8x64));
|
||||
|
||||
(i8x64, u8x64, as_u8x64)
|
||||
);
|
||||
|
|
|
|||
|
|
@ -42,7 +42,8 @@ define_integer_ops!(
|
|||
(u16x4, u16),
|
||||
(i16x4, i16),
|
||||
(u8x8, u8),
|
||||
(i8x8, i8));
|
||||
(i8x8, i8)
|
||||
);
|
||||
define_casts!(
|
||||
(f32x2, f64x2, as_f64x2),
|
||||
(f32x2, u32x2, as_u32x2),
|
||||
|
|
@ -61,5 +62,4 @@ define_casts!(
|
|||
(u8x8, u16x8, as_u16x8),
|
||||
(u16x4, u32x4, as_u32x4),
|
||||
(u32x2, u64x2, as_u64x2)
|
||||
|
||||
);
|
||||
|
|
|
|||
|
|
@ -4,11 +4,19 @@
|
|||
//!
|
||||
//! The references are:
|
||||
//!
|
||||
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: Instruction Set Reference, A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf).
|
||||
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and System Instructions](http://support.amd.com/TechDocs/24594.pdf).
|
||||
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
|
||||
//! Instruction Set Reference, A-Z][intel64_ref].
|
||||
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
|
||||
//! System Instructions][amd64_ref].
|
||||
//!
|
||||
//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29)
|
||||
//! provides a quick overview of the instructions available.
|
||||
//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
|
||||
//! available.
|
||||
//!
|
||||
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
|
||||
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
|
||||
//! [wikipedia_bmi]:
|
||||
//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.
|
||||
//! 28Advanced_Bit_Manipulation.29
|
||||
|
||||
#[cfg(test)]
|
||||
use stdsimd_test::assert_instr;
|
||||
|
|
@ -19,7 +27,9 @@ use stdsimd_test::assert_instr;
|
|||
#[inline(always)]
|
||||
#[target_feature = "+lzcnt"]
|
||||
#[cfg_attr(test, assert_instr(lzcnt))]
|
||||
pub unsafe fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
|
||||
pub unsafe fn _lzcnt_u32(x: u32) -> u32 {
|
||||
x.leading_zeros()
|
||||
}
|
||||
|
||||
/// Counts the leading most significant zero bits.
|
||||
///
|
||||
|
|
@ -27,19 +37,25 @@ pub unsafe fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() }
|
|||
#[inline(always)]
|
||||
#[target_feature = "+lzcnt"]
|
||||
#[cfg_attr(test, assert_instr(lzcnt))]
|
||||
pub unsafe fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 }
|
||||
pub unsafe fn _lzcnt_u64(x: u64) -> u64 {
|
||||
x.leading_zeros() as u64
|
||||
}
|
||||
|
||||
/// Counts the bits that are set.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+popcnt"]
|
||||
#[cfg_attr(test, assert_instr(popcnt))]
|
||||
pub unsafe fn _popcnt32(x: u32) -> u32 { x.count_ones() }
|
||||
pub unsafe fn _popcnt32(x: u32) -> u32 {
|
||||
x.count_ones()
|
||||
}
|
||||
|
||||
/// Counts the bits that are set.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+popcnt"]
|
||||
#[cfg_attr(test, assert_instr(popcnt))]
|
||||
pub unsafe fn _popcnt64(x: u64) -> u64 { x.count_ones() as u64 }
|
||||
pub unsafe fn _popcnt64(x: u64) -> u64 {
|
||||
x.count_ones() as u64
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
|
|
|||
|
|
@ -18,7 +18,8 @@ pub unsafe fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 {
|
|||
a + b
|
||||
}
|
||||
|
||||
/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
|
||||
/// Add packed single-precision (32-bit) floating-point elements in `a` and
|
||||
/// `b`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vaddps))]
|
||||
|
|
@ -26,7 +27,8 @@ pub unsafe fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 {
|
|||
a + b
|
||||
}
|
||||
|
||||
/// Compute the bitwise AND of a packed double-precision (64-bit) floating-point elements
|
||||
/// Compute the bitwise AND of a packed double-precision (64-bit)
|
||||
/// floating-point elements
|
||||
/// in `a` and `b`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
|
|
@ -39,7 +41,8 @@ pub unsafe fn _mm256_and_pd(a: f64x4, b: f64x4) -> f64x4 {
|
|||
mem::transmute(a & b)
|
||||
}
|
||||
|
||||
/// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
|
||||
/// Compute the bitwise AND of packed single-precision (32-bit) floating-point
|
||||
/// elements in `a` and `b`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vandps))]
|
||||
|
|
@ -49,7 +52,8 @@ pub unsafe fn _mm256_and_ps(a: f32x8, b: f32x8) -> f32x8 {
|
|||
mem::transmute(a & b)
|
||||
}
|
||||
|
||||
/// Compute the bitwise OR packed double-precision (64-bit) floating-point elements
|
||||
/// Compute the bitwise OR packed double-precision (64-bit) floating-point
|
||||
/// elements
|
||||
/// in `a` and `b`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
|
|
@ -62,7 +66,8 @@ pub unsafe fn _mm256_or_pd(a: f64x4, b: f64x4) -> f64x4 {
|
|||
mem::transmute(a | b)
|
||||
}
|
||||
|
||||
/// Compute the bitwise OR packed single-precision (32-bit) floating-point elements in `a` and `b`.
|
||||
/// Compute the bitwise OR packed single-precision (32-bit) floating-point
|
||||
/// elements in `a` and `b`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vorps))]
|
||||
|
|
@ -114,7 +119,8 @@ pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
|
|||
}
|
||||
}
|
||||
|
||||
/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`
|
||||
/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
|
||||
/// elements in `a`
|
||||
/// and then AND with `b`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
|
|
@ -126,7 +132,8 @@ pub unsafe fn _mm256_andnot_pd(a: f64x4, b: f64x4) -> f64x4 {
|
|||
mem::transmute((!a) & b)
|
||||
}
|
||||
|
||||
/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a`
|
||||
/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
|
||||
/// elements in `a`
|
||||
/// and then AND with `b`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
|
|
@ -146,8 +153,8 @@ pub unsafe fn _mm256_max_pd(a: f64x4, b: f64x4) -> f64x4 {
|
|||
maxpd256(a, b)
|
||||
}
|
||||
|
||||
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`,
|
||||
/// and return packed maximum values
|
||||
/// Compare packed single-precision (32-bit) floating-point elements in `a`
|
||||
/// and `b`, and return packed maximum values
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vmaxps))]
|
||||
|
|
@ -164,8 +171,8 @@ pub unsafe fn _mm256_min_pd(a: f64x4, b: f64x4) -> f64x4 {
|
|||
minpd256(a, b)
|
||||
}
|
||||
|
||||
/// Compare packed single-precision (32-bit) floating-point elements in `a` and `b`,
|
||||
/// and return packed minimum values
|
||||
/// Compare packed single-precision (32-bit) floating-point elements in `a`
|
||||
/// and `b`, and return packed minimum values
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vminps))]
|
||||
|
|
@ -182,7 +189,8 @@ pub unsafe fn _mm256_mul_pd(a: f64x4, b: f64x4) -> f64x4 {
|
|||
a * b
|
||||
}
|
||||
|
||||
/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`.
|
||||
/// Add packed single-precision (32-bit) floating-point elements in `a` and
|
||||
/// `b`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vmulps))]
|
||||
|
|
@ -266,8 +274,8 @@ pub unsafe fn _mm256_round_pd(a: f64x4, b: i32) -> f64x4 {
|
|||
constify_imm8!(b, call)
|
||||
}
|
||||
|
||||
/// Round packed double-precision (64-bit) floating point elements in `a` toward
|
||||
/// positive infinity.
|
||||
/// Round packed double-precision (64-bit) floating point elements in `a`
|
||||
/// toward positive infinity.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vroundpd))]
|
||||
|
|
@ -275,8 +283,8 @@ pub unsafe fn _mm256_ceil_pd(a: f64x4) -> f64x4 {
|
|||
roundpd256(a, 0x02)
|
||||
}
|
||||
|
||||
/// Round packed double-precision (64-bit) floating point elements in `a` toward
|
||||
/// negative infinity.
|
||||
/// Round packed double-precision (64-bit) floating point elements in `a`
|
||||
/// toward negative infinity.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vroundpd))]
|
||||
|
|
@ -292,9 +300,9 @@ pub unsafe fn _mm256_floor_pd(a: f64x4) -> f64x4 {
|
|||
/// - `0x02`: Round up, toward positive infinity.
|
||||
/// - `0x03`: Truncate the values.
|
||||
///
|
||||
/// For a complete list of options, check the LLVM docs:
|
||||
/// For a complete list of options, check [the LLVM docs][llvm_docs].
|
||||
///
|
||||
/// https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
|
||||
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vroundps, b = 0x00))]
|
||||
|
|
@ -307,8 +315,8 @@ pub unsafe fn _mm256_round_ps(a: f32x8, b: i32) -> f32x8 {
|
|||
constify_imm8!(b, call)
|
||||
}
|
||||
|
||||
/// Round packed single-precision (32-bit) floating point elements in `a` toward
|
||||
/// positive infinity.
|
||||
/// Round packed single-precision (32-bit) floating point elements in `a`
|
||||
/// toward positive infinity.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vroundps))]
|
||||
|
|
@ -316,8 +324,8 @@ pub unsafe fn _mm256_ceil_ps(a: f32x8) -> f32x8 {
|
|||
roundps256(a, 0x02)
|
||||
}
|
||||
|
||||
/// Round packed single-precision (32-bit) floating point elements in `a` toward
|
||||
/// negative infinity.
|
||||
/// Round packed single-precision (32-bit) floating point elements in `a`
|
||||
/// toward negative infinity.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vroundps))]
|
||||
|
|
@ -606,7 +614,8 @@ pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 {
|
|||
/// Compare the lower double-precision (64-bit) floating-point element in
|
||||
/// `a` and `b` based on the comparison operand specified by `imm8`,
|
||||
/// store the result in the lower element of returned vector,
|
||||
/// and copy the upper element from `a` to the upper element of returned vector.
|
||||
/// and copy the upper element from `a` to the upper element of returned
|
||||
/// vector.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx,+sse2"]
|
||||
#[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd
|
||||
|
|
@ -811,7 +820,9 @@ pub unsafe fn _mm_permutevar_ps(a: f32x4, b: i32x4) -> f32x4 {
|
|||
#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
|
||||
pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 {
|
||||
let imm8 = (imm8 & 0xFF) as u8;
|
||||
const fn add4(x: u32) -> u32 { x + 4 }
|
||||
const fn add4(x: u32) -> u32 {
|
||||
x + 4
|
||||
}
|
||||
macro_rules! shuffle4 {
|
||||
($a:expr, $b:expr, $c:expr, $d:expr) => {
|
||||
simd_shuffle8(a, _mm256_undefined_ps(), [
|
||||
|
|
@ -857,7 +868,7 @@ pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 {
|
|||
}
|
||||
}
|
||||
|
||||
/// Shuffle single-precision (32-bit) floating-point elements in `a`
|
||||
/// Shuffle single-precision (32-bit) floating-point elements in `a`
|
||||
/// using the control in `imm8`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx,+sse"]
|
||||
|
|
@ -1026,7 +1037,9 @@ pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
|
||||
pub unsafe fn _mm256_permute2f128_si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 {
|
||||
pub unsafe fn _mm256_permute2f128_si256(
|
||||
a: i32x8, b: i32x8, imm8: i8
|
||||
) -> i32x8 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { vperm2f128si256(a, b, $imm8) }
|
||||
}
|
||||
|
|
@ -1110,7 +1123,9 @@ pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
|
||||
pub unsafe fn _mm256_insertf128_si256(a: __m256i, b: __m128i, imm8: i32) -> __m256i {
|
||||
pub unsafe fn _mm256_insertf128_si256(
|
||||
a: __m256i, b: __m128i, imm8: i32
|
||||
) -> __m256i {
|
||||
let b = i64x4::from(_mm256_castsi128_si256(b));
|
||||
let dst: i64x4 = match imm8 & 1 {
|
||||
0 => simd_shuffle4(i64x4::from(a), b, [4, 5, 2, 3]),
|
||||
|
|
@ -1166,7 +1181,8 @@ pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> f64x4 {
|
|||
ptr::copy_nonoverlapping(
|
||||
mem_addr as *const u8,
|
||||
&mut dst as *mut f64x4 as *mut u8,
|
||||
mem::size_of::<f64x4>());
|
||||
mem::size_of::<f64x4>(),
|
||||
);
|
||||
dst
|
||||
}
|
||||
|
||||
|
|
@ -1191,7 +1207,8 @@ pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> f32x8 {
|
|||
ptr::copy_nonoverlapping(
|
||||
mem_addr as *const u8,
|
||||
&mut dst as *mut f32x8 as *mut u8,
|
||||
mem::size_of::<f32x8>());
|
||||
mem::size_of::<f32x8>(),
|
||||
);
|
||||
dst
|
||||
}
|
||||
|
||||
|
|
@ -1215,12 +1232,13 @@ pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
|
|||
ptr::copy_nonoverlapping(
|
||||
mem_addr as *const u8,
|
||||
&mut dst as *mut __m256i as *mut u8,
|
||||
mem::size_of::<__m256i>());
|
||||
mem::size_of::<__m256i>(),
|
||||
);
|
||||
dst
|
||||
}
|
||||
|
||||
/// Store 256-bits of integer data from `a` into memory.
|
||||
/// `mem_addr` does not need to be aligned on any particular boundary.
|
||||
/// `mem_addr` does not need to be aligned on any particular boundary.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
|
||||
|
|
@ -1234,7 +1252,7 @@ pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vmaskmovpd))]
|
||||
pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: i64x4) -> f64x4 {
|
||||
pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: i64x4) -> f64x4 {
|
||||
maskloadpd256(mem_addr as *const i8, mask)
|
||||
}
|
||||
|
||||
|
|
@ -1272,7 +1290,7 @@ pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: i64x2, a: f64x2) {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vmaskmovps))]
|
||||
pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: i32x8) -> f32x8 {
|
||||
pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: i32x8) -> f32x8 {
|
||||
maskloadps256(mem_addr as *const i8, mask)
|
||||
}
|
||||
|
||||
|
|
@ -1592,7 +1610,8 @@ pub unsafe fn _mm_testnzc_ps(a: f32x4, b: f32x4) -> i32 {
|
|||
}
|
||||
|
||||
/// Set each bit of the returned mask based on the most significant bit of the
|
||||
/// corresponding packed double-precision (64-bit) floating-point element in `a`.
|
||||
/// corresponding packed double-precision (64-bit) floating-point element in
|
||||
/// `a`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vmovmskpd))]
|
||||
|
|
@ -1601,7 +1620,8 @@ pub unsafe fn _mm256_movemask_pd(a: f64x4) -> i32 {
|
|||
}
|
||||
|
||||
/// Set each bit of the returned mask based on the most significant bit of the
|
||||
/// corresponding packed single-precision (32-bit) floating-point element in `a`.
|
||||
/// corresponding packed single-precision (32-bit) floating-point element in
|
||||
/// `a`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
#[cfg_attr(test, assert_instr(vmovmskps))]
|
||||
|
|
@ -1646,8 +1666,9 @@ pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> f64x4 {
|
|||
/// vector with the supplied values.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
pub unsafe fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32,
|
||||
e: f32, f: f32, g: f32, h: f32) -> f32x8 {
|
||||
pub unsafe fn _mm256_set_ps(
|
||||
a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32
|
||||
) -> f32x8 {
|
||||
f32x8::new(h, g, f, e, d, c, b, a)
|
||||
}
|
||||
|
||||
|
|
@ -1655,44 +1676,45 @@ pub unsafe fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32,
|
|||
/// reverse order.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
pub unsafe fn _mm256_set_epi8(e00: i8, e01: i8, e02: i8, e03: i8,
|
||||
e04: i8, e05: i8, e06: i8, e07: i8,
|
||||
e08: i8, e09: i8, e10: i8, e11: i8,
|
||||
e12: i8, e13: i8, e14: i8, e15: i8,
|
||||
e16: i8, e17: i8, e18: i8, e19: i8,
|
||||
e20: i8, e21: i8, e22: i8, e23: i8,
|
||||
e24: i8, e25: i8, e26: i8, e27: i8,
|
||||
e28: i8, e29: i8, e30: i8, e31: i8) -> i8x32 {
|
||||
i8x32::new(e31, e30, e29, e28,
|
||||
e27, e26, e25, e24,
|
||||
e23, e22, e21, e20,
|
||||
e19, e18, e17, e16,
|
||||
e15, e14, e13, e12,
|
||||
e11, e10, e09, e08,
|
||||
e07, e06, e05, e04,
|
||||
e03, e02, e01, e00)
|
||||
pub unsafe fn _mm256_set_epi8(
|
||||
e00: i8, e01: i8, e02: i8, e03: i8, e04: i8, e05: i8, e06: i8, e07: i8,
|
||||
e08: i8, e09: i8, e10: i8, e11: i8, e12: i8, e13: i8, e14: i8, e15: i8,
|
||||
e16: i8, e17: i8, e18: i8, e19: i8, e20: i8, e21: i8, e22: i8, e23: i8,
|
||||
e24: i8, e25: i8, e26: i8, e27: i8, e28: i8, e29: i8, e30: i8, e31: i8,
|
||||
) -> i8x32 {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
i8x32::new(
|
||||
e31, e30, e29, e28, e27, e26, e25, e24,
|
||||
e23, e22, e21, e20, e19, e18, e17, e16,
|
||||
e15, e14, e13, e12, e11, e10, e09, e08,
|
||||
e07, e06, e05, e04, e03, e02, e01, e00,
|
||||
)
|
||||
}
|
||||
|
||||
/// Set packed 16-bit integers in returned vector with the supplied values.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
pub unsafe fn _mm256_set_epi16(e00: i16, e01: i16, e02: i16, e03: i16,
|
||||
e04: i16, e05: i16, e06: i16, e07: i16,
|
||||
e08: i16, e09: i16, e10: i16, e11: i16,
|
||||
e12: i16, e13: i16, e14: i16, e15: i16) -> i16x16 {
|
||||
i16x16::new(e15, e14, e13, e12,
|
||||
e11, e10, e09, e08,
|
||||
e07, e06, e05, e04,
|
||||
e03, e02, e01, e00)
|
||||
pub unsafe fn _mm256_set_epi16(
|
||||
e00: i16, e01: i16, e02: i16, e03: i16, e04: i16, e05: i16, e06: i16,
|
||||
e07: i16, e08: i16, e09: i16, e10: i16, e11: i16, e12: i16, e13: i16,
|
||||
e14: i16, e15: i16,
|
||||
) -> i16x16 {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
i16x16::new(
|
||||
e15, e14, e13, e12,
|
||||
e11, e10, e09, e08,
|
||||
e07, e06, e05, e04,
|
||||
e03, e02, e01, e00,
|
||||
)
|
||||
}
|
||||
|
||||
/// Set packed 32-bit integers in returned vector with the supplied values.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
pub unsafe fn _mm256_set_epi32(e0: i32, e1: i32, e2: i32, e3: i32,
|
||||
e4: i32, e5: i32, e6: i32, e7: i32) -> i32x8 {
|
||||
i32x8::new(e7, e6, e5, e4,
|
||||
e3, e2, e1, e0)
|
||||
pub unsafe fn _mm256_set_epi32(
|
||||
e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32
|
||||
) -> i32x8 {
|
||||
i32x8::new(e7, e6, e5, e4, e3, e2, e1, e0)
|
||||
}
|
||||
|
||||
/// Set packed 64-bit integers in returned vector with the supplied values.
|
||||
|
|
@ -1715,8 +1737,9 @@ pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> f64x4 {
|
|||
/// vector with the supplied values in reverse order.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
pub unsafe fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32,
|
||||
e: f32, f: f32, g: f32, h: f32) -> f32x8 {
|
||||
pub unsafe fn _mm256_setr_ps(
|
||||
a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32
|
||||
) -> f32x8 {
|
||||
f32x8::new(a, b, c, d, e, f, g, h)
|
||||
}
|
||||
|
||||
|
|
@ -1724,46 +1747,47 @@ pub unsafe fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32,
|
|||
/// reverse order.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
pub unsafe fn _mm256_setr_epi8(e00: i8, e01: i8, e02: i8, e03: i8,
|
||||
e04: i8, e05: i8, e06: i8, e07: i8,
|
||||
e08: i8, e09: i8, e10: i8, e11: i8,
|
||||
e12: i8, e13: i8, e14: i8, e15: i8,
|
||||
e16: i8, e17: i8, e18: i8, e19: i8,
|
||||
e20: i8, e21: i8, e22: i8, e23: i8,
|
||||
e24: i8, e25: i8, e26: i8, e27: i8,
|
||||
e28: i8, e29: i8, e30: i8, e31: i8) -> i8x32 {
|
||||
i8x32::new(e00, e01, e02, e03,
|
||||
e04, e05, e06, e07,
|
||||
e08, e09, e10, e11,
|
||||
e12, e13, e14, e15,
|
||||
e16, e17, e18, e19,
|
||||
e20, e21, e22, e23,
|
||||
e24, e25, e26, e27,
|
||||
e28, e29, e30, e31)
|
||||
pub unsafe fn _mm256_setr_epi8(
|
||||
e00: i8, e01: i8, e02: i8, e03: i8, e04: i8, e05: i8, e06: i8, e07: i8,
|
||||
e08: i8, e09: i8, e10: i8, e11: i8, e12: i8, e13: i8, e14: i8, e15: i8,
|
||||
e16: i8, e17: i8, e18: i8, e19: i8, e20: i8, e21: i8, e22: i8, e23: i8,
|
||||
e24: i8, e25: i8, e26: i8, e27: i8, e28: i8, e29: i8, e30: i8, e31: i8,
|
||||
) -> i8x32 {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
i8x32::new(
|
||||
e00, e01, e02, e03, e04, e05, e06, e07,
|
||||
e08, e09, e10, e11, e12, e13, e14, e15,
|
||||
e16, e17, e18, e19, e20, e21, e22, e23,
|
||||
e24, e25, e26, e27, e28, e29, e30, e31,
|
||||
)
|
||||
}
|
||||
|
||||
/// Set packed 16-bit integers in returned vector with the supplied values in
|
||||
/// reverse order.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
pub unsafe fn _mm256_setr_epi16(e00: i16, e01: i16, e02: i16, e03: i16,
|
||||
e04: i16, e05: i16, e06: i16, e07: i16,
|
||||
e08: i16, e09: i16, e10: i16, e11: i16,
|
||||
e12: i16, e13: i16, e14: i16, e15: i16) -> i16x16 {
|
||||
i16x16::new(e00, e01, e02, e03,
|
||||
e04, e05, e06, e07,
|
||||
e08, e09, e10, e11,
|
||||
e12, e13, e14, e15)
|
||||
pub unsafe fn _mm256_setr_epi16(
|
||||
e00: i16, e01: i16, e02: i16, e03: i16, e04: i16, e05: i16, e06: i16,
|
||||
e07: i16, e08: i16, e09: i16, e10: i16, e11: i16, e12: i16, e13: i16,
|
||||
e14: i16, e15: i16,
|
||||
) -> i16x16 {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
i16x16::new(
|
||||
e00, e01, e02, e03,
|
||||
e04, e05, e06, e07,
|
||||
e08, e09, e10, e11,
|
||||
e12, e13, e14, e15,
|
||||
)
|
||||
}
|
||||
|
||||
/// Set packed 32-bit integers in returned vector with the supplied values in
|
||||
/// reverse order.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx"]
|
||||
pub unsafe fn _mm256_setr_epi32(e0: i32, e1: i32, e2: i32, e3: i32,
|
||||
e4: i32, e5: i32, e6: i32, e7: i32) -> i32x8 {
|
||||
i32x8::new(e0, e1, e2, e3,
|
||||
e4, e5, e6, e7)
|
||||
pub unsafe fn _mm256_setr_epi32(
|
||||
e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32
|
||||
) -> i32x8 {
|
||||
i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)
|
||||
}
|
||||
|
||||
/// Set packed 64-bit integers in returned vector with the supplied values in
|
||||
|
|
@ -1798,10 +1822,13 @@ pub unsafe fn _mm256_set1_ps(a: f32) -> f32x8 {
|
|||
#[cfg_attr(test, assert_instr(vpshufb))]
|
||||
#[cfg_attr(test, assert_instr(vinsertf128))]
|
||||
pub unsafe fn _mm256_set1_epi8(a: i8) -> i8x32 {
|
||||
i8x32::new(a, a, a, a, a, a, a, a,
|
||||
a, a, a, a, a, a, a, a,
|
||||
a, a, a, a, a, a, a, a,
|
||||
a, a, a, a, a, a, a, a)
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
i8x32::new(
|
||||
a, a, a, a, a, a, a, a,
|
||||
a, a, a, a, a, a, a, a,
|
||||
a, a, a, a, a, a, a, a,
|
||||
a, a, a, a, a, a, a, a,
|
||||
)
|
||||
}
|
||||
|
||||
/// Broadcast 16-bit integer `a` to all all elements of returned vector.
|
||||
|
|
@ -1811,8 +1838,7 @@ pub unsafe fn _mm256_set1_epi8(a: i8) -> i8x32 {
|
|||
//#[cfg_attr(test, assert_instr(vpshufb))]
|
||||
#[cfg_attr(test, assert_instr(vinsertf128))]
|
||||
pub unsafe fn _mm256_set1_epi16(a: i16) -> i16x16 {
|
||||
i16x16::new(a, a, a, a, a, a, a, a,
|
||||
a, a, a, a, a, a, a, a)
|
||||
i16x16::new(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
|
||||
}
|
||||
|
||||
/// Broadcast 32-bit integer `a` to all elements of returned vector.
|
||||
|
|
@ -1954,7 +1980,7 @@ pub unsafe fn _mm256_zextps128_ps256(a: f32x4) -> f32x8 {
|
|||
#[target_feature = "+avx,+sse2"]
|
||||
pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
|
||||
use x86::sse2::_mm_setzero_si128;
|
||||
let b = mem::transmute(_mm_setzero_si128());
|
||||
let b = mem::transmute(_mm_setzero_si128());
|
||||
let dst: i64x4 = simd_shuffle4(i64x2::from(a), b, [0, 1, 2, 3]);
|
||||
__m256i::from(dst)
|
||||
}
|
||||
|
|
@ -2044,22 +2070,28 @@ pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
|
|||
}
|
||||
|
||||
/// Load two 128-bit values (composed of 4 packed single-precision (32-bit)
|
||||
/// floating-point elements) from memory, and combine them into a 256-bit value.
|
||||
/// floating-point elements) from memory, and combine them into a 256-bit
|
||||
/// value.
|
||||
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx,+sse"]
|
||||
pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> f32x8 {
|
||||
pub unsafe fn _mm256_loadu2_m128(
|
||||
hiaddr: *const f32, loaddr: *const f32
|
||||
) -> f32x8 {
|
||||
use x86::sse::_mm_loadu_ps;
|
||||
let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
|
||||
_mm256_insertf128_ps(a, _mm_loadu_ps(hiaddr), 1)
|
||||
}
|
||||
|
||||
/// Load two 128-bit values (composed of 2 packed double-precision (64-bit)
|
||||
/// floating-point elements) from memory, and combine them into a 256-bit value.
|
||||
/// floating-point elements) from memory, and combine them into a 256-bit
|
||||
/// value.
|
||||
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx,+sse2"]
|
||||
pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> f64x4 {
|
||||
pub unsafe fn _mm256_loadu2_m128d(
|
||||
hiaddr: *const f64, loaddr: *const f64
|
||||
) -> f64x4 {
|
||||
use x86::sse2::_mm_loadu_pd;
|
||||
let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
|
||||
_mm256_insertf128_pd(a, _mm_loadu_pd(hiaddr), 1)
|
||||
|
|
@ -2070,7 +2102,9 @@ pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> f64
|
|||
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx,+sse2"]
|
||||
pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
|
||||
pub unsafe fn _mm256_loadu2_m128i(
|
||||
hiaddr: *const __m128i, loaddr: *const __m128i
|
||||
) -> __m256i {
|
||||
use x86::sse2::_mm_loadu_si128;
|
||||
let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
|
||||
_mm256_insertf128_si256(a, _mm_loadu_si128(hiaddr), 1)
|
||||
|
|
@ -2082,7 +2116,9 @@ pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i
|
|||
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx,+sse"]
|
||||
pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: f32x8) {
|
||||
pub unsafe fn _mm256_storeu2_m128(
|
||||
hiaddr: *mut f32, loaddr: *mut f32, a: f32x8
|
||||
) {
|
||||
use x86::sse::_mm_storeu_ps;
|
||||
let lo = _mm256_castps256_ps128(a);
|
||||
_mm_storeu_ps(loaddr, lo);
|
||||
|
|
@ -2096,7 +2132,9 @@ pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: f32x8)
|
|||
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx,+sse2"]
|
||||
pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: f64x4) {
|
||||
pub unsafe fn _mm256_storeu2_m128d(
|
||||
hiaddr: *mut f64, loaddr: *mut f64, a: f64x4
|
||||
) {
|
||||
use x86::sse2::_mm_storeu_pd;
|
||||
let lo = _mm256_castpd256_pd128(a);
|
||||
_mm_storeu_pd(loaddr, lo);
|
||||
|
|
@ -2109,7 +2147,9 @@ pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: f64x4)
|
|||
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx,+sse2"]
|
||||
pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
|
||||
pub unsafe fn _mm256_storeu2_m128i(
|
||||
hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i
|
||||
) {
|
||||
use x86::sse2::_mm_storeu_si128;
|
||||
let lo = _mm256_castsi256_si128(a);
|
||||
_mm_storeu_si128(loaddr, lo);
|
||||
|
|
@ -2265,9 +2305,9 @@ extern "C" {
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
use stdsimd_test::simd_test;
|
||||
use test::black_box; // Used to inhibit constant-folding.
|
||||
use test::black_box; // Used to inhibit constant-folding.
|
||||
|
||||
use v128::{f32x4, f64x2, i8x16, i32x4, i64x2};
|
||||
use v128::{f32x4, f64x2, i32x4, i64x2, i8x16};
|
||||
use v256::*;
|
||||
use x86::avx;
|
||||
use x86::{__m128i, __m256i};
|
||||
|
|
@ -2428,7 +2468,7 @@ mod tests {
|
|||
let a = f64x4::new(1., 2., 3., 4.);
|
||||
let b = f64x4::new(5., 6., 7., 8.);
|
||||
let r = avx::_mm256_sub_pd(a, b);
|
||||
let e = f64x4::new(-4.,-4.,-4.,-4.);
|
||||
let e = f64x4::new(-4., -4., -4., -4.);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -2504,7 +2544,7 @@ mod tests {
|
|||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_sqrt_pd() {
|
||||
let a = f64x4::new(4., 9., 16., 25.);
|
||||
let r = avx::_mm256_sqrt_pd(a, );
|
||||
let r = avx::_mm256_sqrt_pd(a);
|
||||
let e = f64x4::new(2., 3., 4., 5.);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -2561,7 +2601,10 @@ mod tests {
|
|||
unsafe fn _mm256_blendv_ps() {
|
||||
let a = f32x8::new(4., 9., 16., 25., 4., 9., 16., 25.);
|
||||
let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0);
|
||||
let c = f32x8::new(0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let c = f32x8::new(
|
||||
0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
|
||||
);
|
||||
let r = avx::_mm256_blendv_ps(a, b, c);
|
||||
let e = f32x8::new(4., 9., 16., 25., 8., 9., 64., 50.0);
|
||||
assert_eq!(r, e);
|
||||
|
|
@ -2572,7 +2615,8 @@ mod tests {
|
|||
let a = f32x8::new(4., 9., 16., 25., 4., 9., 16., 25.);
|
||||
let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0);
|
||||
let r = avx::_mm256_dp_ps(a, b, 0xFF);
|
||||
let e = f32x8::new(200.0, 200.0, 200.0, 200.0, 2387., 2387., 2387., 2387.);
|
||||
let e =
|
||||
f32x8::new(200.0, 200.0, 200.0, 200.0, 2387., 2387., 2387., 2387.);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -2801,20 +2845,21 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_extract_epi8() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i8x32::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32);
|
||||
25, 26, 27, 28, 29, 30, 31, 32,
|
||||
);
|
||||
let r = avx::_mm256_extract_epi8(a, 0);
|
||||
assert_eq!(r, 1);
|
||||
}
|
||||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_extract_epi16() {
|
||||
let a = i16x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let a =
|
||||
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let r = avx::_mm256_extract_epi16(a, 0);
|
||||
assert_eq!(r, 0);
|
||||
}
|
||||
|
|
@ -3004,29 +3049,31 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_insert_epi8() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i8x32::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32);
|
||||
25, 26, 27, 28, 29, 30, 31, 32,
|
||||
);
|
||||
let r = avx::_mm256_insert_epi8(a, 0, 31);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x32::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 0);
|
||||
25, 26, 27, 28, 29, 30, 31, 0,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_insert_epi16() {
|
||||
let a = i16x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let a =
|
||||
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let r = avx::_mm256_insert_epi16(a, 0, 15);
|
||||
let e = i16x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 0);
|
||||
let e =
|
||||
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -3203,18 +3250,22 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_lddqu_si256() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i8x32::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32);
|
||||
25, 26, 27, 28, 29, 30, 31, 32,
|
||||
);
|
||||
let p = &a as *const _;
|
||||
let r = avx::_mm256_lddqu_si256(black_box(p));
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x32::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32);
|
||||
25, 26, 27, 28, 29, 30, 31, 32,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -3222,8 +3273,11 @@ mod tests {
|
|||
unsafe fn _mm256_rcp_ps() {
|
||||
let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
|
||||
let r = avx::_mm256_rcp_ps(a);
|
||||
let e = f32x8::new(0.99975586, 0.49987793, 0.33325195, 0.24993896,
|
||||
0.19995117, 0.16662598, 0.14282227, 0.12496948);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = f32x8::new(
|
||||
0.99975586, 0.49987793, 0.33325195, 0.24993896,
|
||||
0.19995117, 0.16662598, 0.14282227, 0.12496948,
|
||||
);
|
||||
let rel_err = 0.00048828125;
|
||||
for i in 0..8 {
|
||||
assert_approx_eq!(r.extract(i), e.extract(i), 2. * rel_err);
|
||||
|
|
@ -3234,8 +3288,11 @@ mod tests {
|
|||
unsafe fn _mm256_rsqrt_ps() {
|
||||
let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
|
||||
let r = avx::_mm256_rsqrt_ps(a);
|
||||
let e = f32x8::new(0.99975586, 0.7069092, 0.5772705, 0.49987793,
|
||||
0.44714355, 0.40820313, 0.3779297, 0.3534546);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = f32x8::new(
|
||||
0.99975586, 0.7069092, 0.5772705, 0.49987793,
|
||||
0.44714355, 0.40820313, 0.3779297, 0.3534546,
|
||||
);
|
||||
let rel_err = 0.00048828125;
|
||||
for i in 0..8 {
|
||||
assert_approx_eq!(r.extract(i), e.extract(i), 2. * rel_err);
|
||||
|
|
@ -3478,30 +3535,39 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_set_epi8() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let r = avx::_mm256_set_epi8(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32);
|
||||
assert_eq!(r, i8x32::new(32, 31, 30, 29, 28, 27, 26, 25,
|
||||
24, 23, 22, 21, 20, 19, 18, 17,
|
||||
16, 15, 14, 13, 12, 11, 10, 9,
|
||||
8, 7, 6, 5, 4, 3, 2, 1));
|
||||
25, 26, 27, 28, 29, 30, 31, 32,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x32::new(
|
||||
32, 31, 30, 29, 28, 27, 26, 25,
|
||||
24, 23, 22, 21, 20, 19, 18, 17,
|
||||
16, 15, 14, 13, 12, 11, 10, 9,
|
||||
8, 7, 6, 5, 4, 3, 2, 1
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_set_epi16() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let r = avx::_mm256_set_epi16(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16);
|
||||
assert_eq!(r, i16x16::new(16, 15, 14, 13, 12, 11, 10, 9,
|
||||
8, 7, 6, 5, 4, 3, 2, 1));
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
);
|
||||
assert_eq!(
|
||||
r,
|
||||
i16x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_set_epi32() {
|
||||
let r = avx::_mm256_set_epi32(
|
||||
1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let r = avx::_mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
assert_eq!(r, i32x8::new(8, 7, 6, 5, 4, 3, 2, 1));
|
||||
}
|
||||
|
||||
|
|
@ -3525,30 +3591,40 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_setr_epi8() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let r = avx::_mm256_setr_epi8(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32);
|
||||
assert_eq!(r, i8x32::new(1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32));
|
||||
25, 26, 27, 28, 29, 30, 31, 32,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x32::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32
|
||||
);
|
||||
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_setr_epi16() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let r = avx::_mm256_setr_epi16(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16);
|
||||
assert_eq!(r, i16x16::new(1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16));
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
);
|
||||
assert_eq!(
|
||||
r,
|
||||
i16x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_setr_epi32() {
|
||||
let r = avx::_mm256_setr_epi32(
|
||||
1, 2, 3, 4, 5, 6, 7, 8);
|
||||
let r = avx::_mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
assert_eq!(r, i32x8::new(1, 2, 3, 4, 5, 6, 7, 8));
|
||||
}
|
||||
|
||||
|
|
@ -3614,19 +3690,25 @@ mod tests {
|
|||
unsafe fn _mm256_castps_si256() {
|
||||
let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
|
||||
let r = avx::_mm256_castps_si256(a);
|
||||
let e = i8x32::new(0, 0, -128, 63, 0, 0, 0, 64,
|
||||
0, 0, 64, 64, 0, 0, -128, 64,
|
||||
0, 0, -96, 64, 0, 0, -64, 64,
|
||||
0, 0, -32, 64, 0, 0, 0, 65);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x32::new(
|
||||
0, 0, -128, 63, 0, 0, 0, 64,
|
||||
0, 0, 64, 64, 0, 0, -128, 64,
|
||||
0, 0, -96, 64, 0, 0, -64, 64,
|
||||
0, 0, -32, 64, 0, 0, 0, 65,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_castsi256_ps() {
|
||||
let a = i8x32::new(0, 0, -128, 63, 0, 0, 0, 64,
|
||||
0, 0, 64, 64, 0, 0, -128, 64,
|
||||
0, 0, -96, 64, 0, 0, -64, 64,
|
||||
0, 0, -32, 64, 0, 0, 0, 65);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i8x32::new(
|
||||
0, 0, -128, 63, 0, 0, 0, 64,
|
||||
0, 0, 64, 64, 0, 0, -128, 64,
|
||||
0, 0, -96, 64, 0, 0, -64, 64,
|
||||
0, 0, -32, 64, 0, 0, 0, 65,
|
||||
);
|
||||
let r = avx::_mm256_castsi256_ps(a);
|
||||
let e = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
|
||||
assert_eq!(r, e);
|
||||
|
|
@ -3711,16 +3793,23 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_set_m128i() {
|
||||
let hi = i8x16::new(17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32);
|
||||
let lo = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let hi = i8x16::new(
|
||||
17, 18, 19, 20,
|
||||
21, 22, 23, 24,
|
||||
25, 26, 27, 28,
|
||||
29, 30, 31, 32,
|
||||
);
|
||||
let lo =
|
||||
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let r = avx::_mm256_set_m128i(hi, lo);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x32::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32);
|
||||
25, 26, 27, 28, 29, 30, 31, 32,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -3744,16 +3833,21 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_setr_m128i() {
|
||||
let lo = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let hi = i8x16::new(17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32);
|
||||
let lo =
|
||||
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let hi = i8x16::new(
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32,
|
||||
);
|
||||
let r = avx::_mm256_setr_m128i(lo, hi);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x32::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32);
|
||||
25, 26, 27, 28, 29, 30, 31, 32,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -3781,17 +3875,24 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_loadu2_m128i() {
|
||||
let hi = i8x16::new(17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32);
|
||||
let lo = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let r = avx::_mm256_loadu2_m128i(&hi as *const _ as *const _,
|
||||
&lo as *const _ as *const _);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let hi = i8x16::new(
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32,
|
||||
);
|
||||
let lo =
|
||||
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let r = avx::_mm256_loadu2_m128i(
|
||||
&hi as *const _ as *const _,
|
||||
&lo as *const _ as *const _,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x32::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32);
|
||||
25, 26, 27, 28, 29, 30, 31, 32,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -3801,9 +3902,11 @@ mod tests {
|
|||
let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
|
||||
let mut hi = _mm_undefined_ps();
|
||||
let mut lo = _mm_undefined_ps();
|
||||
avx::_mm256_storeu2_m128(&mut hi as *mut _ as *mut f32,
|
||||
&mut lo as *mut _ as *mut f32,
|
||||
a);
|
||||
avx::_mm256_storeu2_m128(
|
||||
&mut hi as *mut _ as *mut f32,
|
||||
&mut lo as *mut _ as *mut f32,
|
||||
a,
|
||||
);
|
||||
assert_eq!(hi, f32x4::new(5., 6., 7., 8.));
|
||||
assert_eq!(lo, f32x4::new(1., 2., 3., 4.));
|
||||
}
|
||||
|
|
@ -3814,9 +3917,11 @@ mod tests {
|
|||
let a = f64x4::new(1., 2., 3., 4.);
|
||||
let mut hi = _mm_undefined_pd();
|
||||
let mut lo = _mm_undefined_pd();
|
||||
avx::_mm256_storeu2_m128d(&mut hi as *mut _ as *mut f64,
|
||||
&mut lo as *mut _ as *mut f64,
|
||||
a);
|
||||
avx::_mm256_storeu2_m128d(
|
||||
&mut hi as *mut _ as *mut f64,
|
||||
&mut lo as *mut _ as *mut f64,
|
||||
a,
|
||||
);
|
||||
assert_eq!(hi, f64x2::new(3., 4.));
|
||||
assert_eq!(lo, f64x2::new(1., 2.));
|
||||
}
|
||||
|
|
@ -3824,17 +3929,26 @@ mod tests {
|
|||
#[simd_test = "avx"]
|
||||
unsafe fn _mm256_storeu2_m128i() {
|
||||
use x86::sse2::_mm_undefined_si128;
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i8x32::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32);
|
||||
25, 26, 27, 28, 29, 30, 31, 32,
|
||||
);
|
||||
let mut hi = _mm_undefined_si128();
|
||||
let mut lo = _mm_undefined_si128();
|
||||
avx::_mm256_storeu2_m128i(&mut hi as *mut _, &mut lo as *mut _, a);
|
||||
assert_eq!(hi, i8x16::new(17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32));
|
||||
assert_eq!(lo, i8x16::new(1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16));
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x16::new(
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32
|
||||
);
|
||||
|
||||
assert_eq!(hi, e);
|
||||
assert_eq!(
|
||||
lo,
|
||||
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -96,8 +96,8 @@ pub unsafe fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 {
|
|||
paddusw(a, b)
|
||||
}
|
||||
|
||||
/// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary result,
|
||||
/// shift the result right by `n` bytes, and return the low 16 bytes.
|
||||
/// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
|
||||
/// result, shift the result right by `n` bytes, and return the low 16 bytes.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpalignr, n = 15))]
|
||||
|
|
@ -116,7 +116,9 @@ pub unsafe fn _mm256_alignr_epi8(a: i8x32, b: i8x32, n: i32) -> i8x32 {
|
|||
(a, b, n)
|
||||
};
|
||||
|
||||
const fn add(a: u32, b: u32) -> u32 { a + b }
|
||||
const fn add(a: u32, b: u32) -> u32 {
|
||||
a + b
|
||||
}
|
||||
macro_rules! shuffle {
|
||||
($shift:expr) => {
|
||||
simd_shuffle32(b, a, [
|
||||
|
|
@ -140,14 +142,22 @@ pub unsafe fn _mm256_alignr_epi8(a: i8x32, b: i8x32, n: i32) -> i8x32 {
|
|||
}
|
||||
}
|
||||
match n {
|
||||
0 => shuffle!(0), 1 => shuffle!(1),
|
||||
2 => shuffle!(2), 3 => shuffle!(3),
|
||||
4 => shuffle!(4), 5 => shuffle!(5),
|
||||
6 => shuffle!(6), 7 => shuffle!(7),
|
||||
8 => shuffle!(8), 9 => shuffle!(9),
|
||||
10 => shuffle!(10), 11 => shuffle!(11),
|
||||
12 => shuffle!(12), 13 => shuffle!(13),
|
||||
14 => shuffle!(14), 15 => shuffle!(15),
|
||||
0 => shuffle!(0),
|
||||
1 => shuffle!(1),
|
||||
2 => shuffle!(2),
|
||||
3 => shuffle!(3),
|
||||
4 => shuffle!(4),
|
||||
5 => shuffle!(5),
|
||||
6 => shuffle!(6),
|
||||
7 => shuffle!(7),
|
||||
8 => shuffle!(8),
|
||||
9 => shuffle!(9),
|
||||
10 => shuffle!(10),
|
||||
11 => shuffle!(11),
|
||||
12 => shuffle!(12),
|
||||
13 => shuffle!(13),
|
||||
14 => shuffle!(14),
|
||||
15 => shuffle!(15),
|
||||
_ => shuffle!(16),
|
||||
}
|
||||
}
|
||||
|
|
@ -174,7 +184,7 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpavgw))]
|
||||
pub unsafe fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 {
|
||||
pub unsafe fn _mm256_avg_epu16(a: u16x16, b: u16x16) -> u16x16 {
|
||||
pavgw(a, b)
|
||||
}
|
||||
|
||||
|
|
@ -182,7 +192,7 @@ pub unsafe fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpavgb))]
|
||||
pub unsafe fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 {
|
||||
pub unsafe fn _mm256_avg_epu8(a: u8x32, b: u8x32) -> u8x32 {
|
||||
pavgb(a, b)
|
||||
}
|
||||
|
||||
|
|
@ -320,8 +330,8 @@ pub unsafe fn _mm256_blend_epi16(a: i16x16, b: i16x16, imm8: i32) -> i16x16 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpblendvb))]
|
||||
pub unsafe fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 {
|
||||
pblendvb(a,b,mask)
|
||||
pub unsafe fn _mm256_blendv_epi8(a: i8x32, b: i8x32, mask: __m256i) -> i8x32 {
|
||||
pblendvb(a, b, mask)
|
||||
}
|
||||
|
||||
/// Broadcast the low packed 8-bit integer from `a` to all elements of
|
||||
|
|
@ -628,37 +638,83 @@ pub unsafe fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 {
|
|||
}
|
||||
|
||||
|
||||
// TODO _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale)
|
||||
// TODO _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale)
|
||||
// TODO _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
|
||||
// TODO _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)
|
||||
// TODO _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
|
||||
// TODO _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale)
|
||||
// TODO _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
|
||||
// TODO _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale)
|
||||
// TODO _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale)
|
||||
// TODO _mm_mask_i32gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale)
|
||||
// TODO _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale)
|
||||
// TODO _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale)
|
||||
// TODO _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale)
|
||||
// TODO _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale)
|
||||
// TODO _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale)
|
||||
// TODO _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale)
|
||||
// TODO _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale)
|
||||
// TODO _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale)
|
||||
// TODO _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
|
||||
// TODO _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale)
|
||||
// TODO _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
|
||||
// TODO _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale)
|
||||
// TODO _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale)
|
||||
// TODO _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale)
|
||||
// TODO _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale)
|
||||
// TODO _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale)
|
||||
// TODO _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale)
|
||||
// TODO _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale)
|
||||
// TODO _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale)
|
||||
// TODO _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale)
|
||||
// TODO _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale)
|
||||
// TODO _mm_i32gather_epi32 (int const* base_addr, __m128i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr,
|
||||
// __m128i vindex, __m128i mask,
|
||||
// const int scale)
|
||||
// TODO _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr,
|
||||
// __m256i vindex, __m256i mask,
|
||||
// const int scale)
|
||||
// TODO _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr,
|
||||
// __m128i vindex, __m128i mask,
|
||||
// const int scale)
|
||||
// TODO _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr,
|
||||
// __m128i vindex, __m256i mask,
|
||||
// const int scale)
|
||||
// TODO _mm_i32gather_pd (double const* base_addr, __m128i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm_mask_i32gather_pd (__m128d src, double const* base_addr,
|
||||
// __m128i vindex, __m128d mask,
|
||||
// const int scale)
|
||||
// TODO _mm256_i32gather_pd (double const* base_addr, __m128i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm256_mask_i32gather_pd (__m256d src, double const* base_addr,
|
||||
// __m128i vindex, __m256d mask,
|
||||
// const int scale)
|
||||
// TODO _mm_i32gather_ps (float const* base_addr, __m128i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm_mask_i32gather_ps (__m128 src, float const* base_addr,
|
||||
// __m128i vindex, __m128 mask,
|
||||
// const int scale)
|
||||
// TODO _mm256_i32gather_ps (float const* base_addr, __m256i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm256_mask_i32gather_ps (__m256 src, float const* base_addr,
|
||||
// __m256i vindex, __m256 mask,
|
||||
// const int scale)
|
||||
// TODO _mm_i64gather_epi32 (int const* base_addr, __m128i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr,
|
||||
// __m128i vindex, __m128i mask,
|
||||
// const int scale)
|
||||
// TODO _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr,
|
||||
// __m256i vindex, __m128i mask,
|
||||
// const int scale)
|
||||
// TODO _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr,
|
||||
// __m128i vindex, __m128i mask,
|
||||
// const int scale)
|
||||
// TODO _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr,
|
||||
// __m256i vindex, __m256i mask,
|
||||
// const int scale)
|
||||
// TODO _mm_i64gather_pd (double const* base_addr, __m128i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm_mask_i64gather_pd (__m128d src, double const* base_addr,
|
||||
// __m128i vindex, __m128d mask,
|
||||
// const int scale)
|
||||
// TODO _mm256_i64gather_pd (double const* base_addr, __m256i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm256_mask_i64gather_pd (__m256d src, double const* base_addr,
|
||||
// __m256i vindex, __m256d mask,
|
||||
// const int scale)
|
||||
// TODO _mm_i64gather_ps (float const* base_addr, __m128i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm_mask_i64gather_ps (__m128 src, float const* base_addr,
|
||||
// __m128i vindex, __m128 mask,
|
||||
// const int scale)
|
||||
// TODO _mm256_i64gather_ps (float const* base_addr, __m256i vindex,
|
||||
// const int scale)
|
||||
// TODO _mm256_mask_i64gather_ps
|
||||
// TODO _mm256_inserti128_si256
|
||||
|
||||
|
|
@ -946,7 +1002,7 @@ pub unsafe fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpmullw))]
|
||||
pub unsafe fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
|
||||
pub unsafe fn _mm256_mullo_epi16(a: i16x16, b: i16x16) -> i16x16 {
|
||||
a * b
|
||||
}
|
||||
|
||||
|
|
@ -957,7 +1013,7 @@ pub unsafe fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpmulld))]
|
||||
pub unsafe fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
|
||||
pub unsafe fn _mm256_mullo_epi32(a: i32x8, b: i32x8) -> i32x8 {
|
||||
a * b
|
||||
}
|
||||
|
||||
|
|
@ -968,7 +1024,7 @@ pub unsafe fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpmulhrsw))]
|
||||
pub unsafe fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 {
|
||||
pub unsafe fn _mm256_mulhrs_epi16(a: i16x16, b: i16x16) -> i16x16 {
|
||||
pmulhrsw(a, b)
|
||||
}
|
||||
|
||||
|
|
@ -1088,7 +1144,7 @@ pub unsafe fn _mm256_permute4x64_epi64(a: i64x4, imm8: i32) -> i64x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpsadbw))]
|
||||
pub unsafe fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 {
|
||||
pub unsafe fn _mm256_sad_epu8(a: u8x32, b: u8x32) -> u64x4 {
|
||||
psadbw(a, b)
|
||||
}
|
||||
|
||||
|
|
@ -1580,15 +1636,19 @@ pub unsafe fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 {
|
|||
/// use stdsimd::simd::i8x32;
|
||||
/// use stdsimd::vendor::_mm256_unpackhi_epi8;
|
||||
///
|
||||
/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
|
||||
/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
|
||||
/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
/// 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
|
||||
/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,
|
||||
/// -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
|
||||
///
|
||||
/// let c: i8x32;
|
||||
/// unsafe {
|
||||
/// c = _mm256_unpackhi_epi8(a, b);
|
||||
/// }
|
||||
///
|
||||
/// let expected = i8x32::new(8,-8, 9,-9, 10,-10, 11,-11, 12,-12, 13,-13, 14,-14, 15,-15, 24,-24, 25,-25, 26,-26, 27,-27, 28,-28, 29,-29, 30,-30, 31,-31);
|
||||
/// let expected = i8x32::new(8,-8, 9,-9, 10,-10, 11,-11, 12,-12, 13,-13,
|
||||
/// 14,-14, 15,-15, 24,-24, 25,-25, 26,-26, 27,-27, 28,-28, 29,-29, 30,-30,
|
||||
/// 31,-31);
|
||||
/// assert_eq!(c, expected);
|
||||
///
|
||||
/// # }
|
||||
|
|
@ -1600,7 +1660,13 @@ pub unsafe fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 {
|
|||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpunpckhbw))]
|
||||
pub unsafe fn _mm256_unpackhi_epi8(a: i8x32, b: i8x32) -> i8x32 {
|
||||
simd_shuffle32(a, b, [8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63])
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
simd_shuffle32(a, b, [
|
||||
8, 40, 9, 41, 10, 42, 11, 43,
|
||||
12, 44, 13, 45, 14, 46, 15, 47,
|
||||
24, 56, 25, 57, 26, 58, 27, 59,
|
||||
28, 60, 29, 61, 30, 62, 31, 63,
|
||||
])
|
||||
}
|
||||
|
||||
/// Unpack and interleave 8-bit integers from the low half of each
|
||||
|
|
@ -1619,15 +1685,18 @@ pub unsafe fn _mm256_unpackhi_epi8(a: i8x32, b: i8x32) -> i8x32 {
|
|||
/// use stdsimd::simd::i8x32;
|
||||
/// use stdsimd::vendor::_mm256_unpacklo_epi8;
|
||||
///
|
||||
/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
|
||||
/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
|
||||
/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
/// 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
|
||||
/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,
|
||||
/// -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
|
||||
///
|
||||
/// let c: i8x32;
|
||||
/// unsafe {
|
||||
/// c = _mm256_unpacklo_epi8(a, b);
|
||||
/// }
|
||||
///
|
||||
/// let expected = i8x32::new(0, 0, 1,-1, 2,-2, 3,-3, 4,-4, 5,-5, 6,-6, 7,-7, 16,-16, 17,-17, 18,-18, 19,-19, 20,-20, 21,-21, 22,-22, 23,-23);
|
||||
/// let expected = i8x32::new(0, 0, 1,-1, 2,-2, 3,-3, 4,-4, 5,-5, 6,-6, 7,-7,
|
||||
/// 16,-16, 17,-17, 18,-18, 19,-19, 20,-20, 21,-21, 22,-22, 23,-23);
|
||||
/// assert_eq!(c, expected);
|
||||
///
|
||||
/// # }
|
||||
|
|
@ -1639,7 +1708,13 @@ pub unsafe fn _mm256_unpackhi_epi8(a: i8x32, b: i8x32) -> i8x32 {
|
|||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpunpcklbw))]
|
||||
pub unsafe fn _mm256_unpacklo_epi8(a: i8x32, b: i8x32) -> i8x32 {
|
||||
simd_shuffle32(a, b, [0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55])
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
simd_shuffle32(a, b, [
|
||||
0, 32, 1, 33, 2, 34, 3, 35,
|
||||
4, 36, 5, 37, 6, 38, 7, 39,
|
||||
16, 48, 17, 49, 18, 50, 19, 51,
|
||||
20, 52, 21, 53, 22, 54, 23, 55,
|
||||
])
|
||||
}
|
||||
|
||||
/// Unpack and interleave 16-bit integers from the high half of each
|
||||
|
|
@ -1666,7 +1741,8 @@ pub unsafe fn _mm256_unpacklo_epi8(a: i8x32, b: i8x32) -> i8x32 {
|
|||
/// c = _mm256_unpackhi_epi16(a, b);
|
||||
/// }
|
||||
///
|
||||
/// let expected = i16x16::new(4,-4, 5,-5, 6,-6, 7,-7, 12,-12, 13,-13, 14,-14, 15,-15);
|
||||
/// let expected = i16x16::new(4,-4, 5,-5, 6,-6, 7,-7, 12,-12, 13,-13, 14,-14,
|
||||
/// 15,-15);
|
||||
/// assert_eq!(c, expected);
|
||||
///
|
||||
/// # }
|
||||
|
|
@ -1678,7 +1754,11 @@ pub unsafe fn _mm256_unpacklo_epi8(a: i8x32, b: i8x32) -> i8x32 {
|
|||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpunpckhwd))]
|
||||
pub unsafe fn _mm256_unpackhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
|
||||
simd_shuffle16(a, b, [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31])
|
||||
simd_shuffle16(
|
||||
a,
|
||||
b,
|
||||
[4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
|
||||
)
|
||||
}
|
||||
|
||||
/// Unpack and interleave 16-bit integers from the low half of each
|
||||
|
|
@ -1705,7 +1785,8 @@ pub unsafe fn _mm256_unpackhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
|
|||
/// c = _mm256_unpacklo_epi16(a, b);
|
||||
/// }
|
||||
///
|
||||
/// let expected = i16x16::new(0, 0, 1,-1, 2,-2, 3,-3, 8,-8, 9,-9, 10,-10, 11,-11);
|
||||
/// let expected = i16x16::new(0, 0, 1,-1, 2,-2, 3,-3, 8,-8, 9,-9, 10,-10,
|
||||
/// 11,-11);
|
||||
/// assert_eq!(c, expected);
|
||||
///
|
||||
/// # }
|
||||
|
|
@ -1717,7 +1798,11 @@ pub unsafe fn _mm256_unpackhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
|
|||
#[target_feature = "+avx2"]
|
||||
#[cfg_attr(test, assert_instr(vpunpcklwd))]
|
||||
pub unsafe fn _mm256_unpacklo_epi16(a: i16x16, b: i16x16) -> i16x16 {
|
||||
simd_shuffle16(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27])
|
||||
simd_shuffle16(
|
||||
a,
|
||||
b,
|
||||
[0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
|
||||
)
|
||||
}
|
||||
|
||||
/// Unpack and interleave 32-bit integers from the high half of each
|
||||
|
|
@ -1972,9 +2057,9 @@ extern "C" {
|
|||
#[link_name = "llvm.x86.avx2.pmulh.w"]
|
||||
fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
|
||||
#[link_name = "llvm.x86.avx2.pmul.dq"]
|
||||
fn pmuldq(a: i32x8, b:i32x8) -> i64x4;
|
||||
fn pmuldq(a: i32x8, b: i32x8) -> i64x4;
|
||||
#[link_name = "llvm.x86.avx2.pmulu.dq"]
|
||||
fn pmuludq(a: u32x8, b:u32x8) -> u64x4;
|
||||
fn pmuludq(a: u32x8, b: u32x8) -> u64x4;
|
||||
#[link_name = "llvm.x86.avx2.pmul.hr.sw"]
|
||||
fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
|
||||
#[link_name = "llvm.x86.avx2.packsswb"]
|
||||
|
|
@ -2006,17 +2091,17 @@ extern "C" {
|
|||
#[link_name = "llvm.x86.avx2.pslli.q"]
|
||||
fn pslliq(a: i64x4, imm8: i32) -> i64x4;
|
||||
#[link_name = "llvm.x86.avx2.psllv.d"]
|
||||
fn psllvd(a:i32x4, count:i32x4) -> i32x4;
|
||||
fn psllvd(a: i32x4, count: i32x4) -> i32x4;
|
||||
#[link_name = "llvm.x86.avx2.psllv.d.256"]
|
||||
fn psllvd256(a:i32x8, count:i32x8) -> i32x8;
|
||||
fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
|
||||
#[link_name = "llvm.x86.avx2.psllv.q"]
|
||||
fn psllvq(a:i64x2, count:i64x2) -> i64x2;
|
||||
fn psllvq(a: i64x2, count: i64x2) -> i64x2;
|
||||
#[link_name = "llvm.x86.avx2.psllv.q.256"]
|
||||
fn psllvq256(a:i64x4, count:i64x4) -> i64x4;
|
||||
fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
|
||||
#[link_name = "llvm.x86.avx2.psra.w"]
|
||||
fn psraw(a: i16x16, count:i16x8) -> i16x16;
|
||||
fn psraw(a: i16x16, count: i16x8) -> i16x16;
|
||||
#[link_name = "llvm.x86.avx2.psra.d"]
|
||||
fn psrad(a: i32x8, count:i32x4) -> i32x8;
|
||||
fn psrad(a: i32x8, count: i32x4) -> i32x8;
|
||||
#[link_name = "llvm.x86.avx2.psrai.w"]
|
||||
fn psraiw(a: i16x16, imm8: i32) -> i16x16;
|
||||
#[link_name = "llvm.x86.avx2.psrai.d"]
|
||||
|
|
@ -2026,11 +2111,11 @@ extern "C" {
|
|||
#[link_name = "llvm.x86.avx2.psrav.d.256"]
|
||||
fn psravd256(a: i32x8, count: i32x8) -> i32x8;
|
||||
#[link_name = "llvm.x86.avx2.psrl.w"]
|
||||
fn psrlw(a: i16x16, count:i16x8) -> i16x16;
|
||||
fn psrlw(a: i16x16, count: i16x8) -> i16x16;
|
||||
#[link_name = "llvm.x86.avx2.psrl.d"]
|
||||
fn psrld(a: i32x8, count:i32x4) -> i32x8;
|
||||
fn psrld(a: i32x8, count: i32x4) -> i32x8;
|
||||
#[link_name = "llvm.x86.avx2.psrl.q"]
|
||||
fn psrlq(a: i64x4, count:i64x2) -> i64x4;
|
||||
fn psrlq(a: i64x4, count: i64x2) -> i64x4;
|
||||
#[link_name = "llvm.x86.avx2.psrli.w"]
|
||||
fn psrliw(a: i16x16, imm8: i32) -> i16x16;
|
||||
#[link_name = "llvm.x86.avx2.psrli.d"]
|
||||
|
|
@ -2071,49 +2156,53 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_abs_epi32() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i32x8::new(
|
||||
0, 1, -1, std::i32::MAX,
|
||||
std::i32::MIN + 1, 100, -100, -32);
|
||||
std::i32::MIN + 1, 100, -100, -32,
|
||||
);
|
||||
let r = avx2::_mm256_abs_epi32(a);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i32x8::new(
|
||||
0, 1, 1, std::i32::MAX,
|
||||
(std::i32::MIN + 1).abs(), 100, 100, 32);
|
||||
(std::i32::MIN + 1).abs(), 100, 100, 32,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_abs_epi16() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i16x16::new(
|
||||
0, 1, -1, 2,
|
||||
-2, 3, -3, 4,
|
||||
-4, 5, -5, std::i16::MAX,
|
||||
std::i16::MIN + 1, 100, -100, -32);
|
||||
0, 1, -1, 2, -2, 3, -3, 4,
|
||||
-4, 5, -5, std::i16::MAX, std::i16::MIN + 1, 100, -100, -32,
|
||||
);
|
||||
let r = avx2::_mm256_abs_epi16(a);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i16x16::new(
|
||||
0, 1, 1, 2,
|
||||
2, 3, 3, 4,
|
||||
4, 5, 5, std::i16::MAX,
|
||||
(std::i16::MIN + 1).abs(), 100, 100, 32);
|
||||
0, 1, 1, 2, 2, 3, 3, 4,
|
||||
4, 5, 5, std::i16::MAX, (std::i16::MIN + 1).abs(), 100, 100, 32,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_abs_epi8() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i8x32::new(
|
||||
0, 1, -1, 2,
|
||||
-2, 3, -3, 4,
|
||||
-4, 5, -5, std::i8::MAX,
|
||||
std::i8::MIN + 1, 100, -100, -32,
|
||||
0, 1, -1, 2,
|
||||
-2, 3, -3, 4,
|
||||
-4, 5, -5, std::i8::MAX,
|
||||
std::i8::MIN + 1, 100, -100, -32);
|
||||
0, 1, -1, 2, -2, 3, -3, 4,
|
||||
-4, 5, -5, std::i8::MAX, std::i8::MIN + 1, 100, -100, -32,
|
||||
0, 1, -1, 2, -2, 3, -3, 4,
|
||||
-4, 5, -5, std::i8::MAX, std::i8::MIN + 1, 100, -100, -32,
|
||||
);
|
||||
let r = avx2::_mm256_abs_epi8(a);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x32::new(
|
||||
0, 1, 1, 2, 2, 3, 3, 4,
|
||||
4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32,
|
||||
0, 1, 1, 2, 2, 3, 3, 4,
|
||||
4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32);
|
||||
4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -2137,52 +2226,70 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_add_epi16() {
|
||||
let a = i16x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let b = i16x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let a =
|
||||
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let b =
|
||||
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let r = avx2::_mm256_add_epi16(a, b);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i16x16::new(
|
||||
0, 2, 4, 6, 8, 10, 12, 14,
|
||||
16, 18, 20, 22, 24, 26, 28, 30);
|
||||
16, 18, 20, 22, 24, 26, 28, 30,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_add_epi8() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i8x32::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23,
|
||||
24, 25, 26, 27, 28, 29, 30, 31);
|
||||
24, 25, 26, 27, 28, 29, 30, 31,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = i8x32::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23,
|
||||
24, 25, 26, 27, 28, 29, 30, 31);
|
||||
24, 25, 26, 27, 28, 29, 30, 31,
|
||||
);
|
||||
let r = avx2::_mm256_add_epi8(a, b);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x32::new(
|
||||
0, 2, 4, 6, 8, 10, 12, 14, 16,
|
||||
18, 20, 22, 24, 26, 28, 30, 32,
|
||||
34, 36, 38, 40, 42, 44, 46, 48,
|
||||
50, 52, 54, 56, 58, 60, 62);
|
||||
0, 2, 4, 6, 8, 10, 12, 14,
|
||||
16, 18, 20, 22, 24, 26, 28, 30,
|
||||
32, 34, 36, 38, 40, 42, 44, 46,
|
||||
48, 50, 52, 54, 56, 58, 60, 62,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_adds_epi8() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i8x32::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23,
|
||||
24, 25, 26, 27, 28, 29, 30, 31,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = i8x32::new(
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
|
||||
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
|
||||
32, 33, 34, 35, 36, 37, 38, 39,
|
||||
40, 41, 42, 43, 44, 45, 46, 47,
|
||||
48, 49, 50, 51, 52, 53, 54, 55,
|
||||
56, 57, 58, 59, 60, 61, 62, 63,
|
||||
);
|
||||
let r = avx2::_mm256_adds_epi8(a, b);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x32::new(
|
||||
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
|
||||
64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94);
|
||||
32, 34, 36, 38, 40, 42, 44, 46,
|
||||
48, 50, 52, 54, 56, 58, 60, 62,
|
||||
64, 66, 68, 70, 72, 74, 76, 78,
|
||||
80, 82, 84, 86, 88, 90, 92, 94,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -2204,13 +2311,19 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_adds_epi16() {
|
||||
let a = i16x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let a =
|
||||
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = i16x16::new(
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47);
|
||||
let r = avx2::_mm256_adds_epi16(a, b);
|
||||
32, 33, 34, 35, 36, 37, 38, 39,
|
||||
40, 41, 42, 43, 44, 45, 46, 47,
|
||||
);
|
||||
let r = avx2::_mm256_adds_epi16(a, b);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i16x16::new(
|
||||
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62);
|
||||
32, 34, 36, 38, 40, 42, 44, 46,
|
||||
48, 50, 52, 54, 56, 58, 60, 62,
|
||||
);
|
||||
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -2233,16 +2346,28 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_adds_epu8() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = u8x32::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23,
|
||||
24, 25, 26, 27, 28, 29, 30, 31,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = u8x32::new(
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
|
||||
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
|
||||
32, 33, 34, 35, 36, 37, 38, 39,
|
||||
40, 41, 42, 43, 44, 45, 46, 47,
|
||||
48, 49, 50, 51, 52, 53, 54, 55,
|
||||
56, 57, 58, 59, 60, 61, 62, 63,
|
||||
);
|
||||
let r = avx2::_mm256_adds_epu8(a, b);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = u8x32::new(
|
||||
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
|
||||
64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94);
|
||||
32, 34, 36, 38, 40, 42, 44, 46,
|
||||
48, 50, 52, 54, 56, 58, 60, 62,
|
||||
64, 66, 68, 70, 72, 74, 76, 78,
|
||||
80, 82, 84, 86, 88, 90, 92, 94,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -2257,13 +2382,19 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_adds_epu16() {
|
||||
let a = u16x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let a =
|
||||
u16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = u16x16::new(
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47);
|
||||
32, 33, 34, 35, 36, 37, 38, 39,
|
||||
40, 41, 42, 43, 44, 45, 46, 47,
|
||||
);
|
||||
let r = avx2::_mm256_adds_epu16(a, b);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = u16x16::new(
|
||||
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62);
|
||||
32, 34, 36, 38, 40, 42, 44, 46,
|
||||
48, 50, 52, 54, 56, 58, 60, 62,
|
||||
);
|
||||
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -2346,11 +2477,11 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_blendv_epi8() {
|
||||
let (a,b) = (i8x32::splat(4),i8x32::splat(2));
|
||||
let mask = i8x32::splat(0).replace(2,-1);
|
||||
let e = i8x32::splat(4).replace(2,2);
|
||||
let r= avx2::_mm256_blendv_epi8(a,b,mask);
|
||||
assert_eq!(r,e);
|
||||
let (a, b) = (i8x32::splat(4), i8x32::splat(2));
|
||||
let mask = i8x32::splat(0).replace(2, -1);
|
||||
let e = i8x32::splat(4).replace(2, 2);
|
||||
let r = avx2::_mm256_blendv_epi8(a, b, mask);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
|
|
@ -2413,8 +2544,12 @@ mod tests {
|
|||
unsafe fn _mm256_broadcastsi128_si256() {
|
||||
let a = i64x2::new(0x0987654321012334, 0x5678909876543210);
|
||||
let res = avx2::_mm256_broadcastsi128_si256(a);
|
||||
let retval = i64x4::new(0x0987654321012334, 0x5678909876543210,
|
||||
0x0987654321012334, 0x5678909876543210);
|
||||
let retval = i64x4::new(
|
||||
0x0987654321012334,
|
||||
0x5678909876543210,
|
||||
0x0987654321012334,
|
||||
0x5678909876543210,
|
||||
);
|
||||
assert_eq!(res, retval);
|
||||
}
|
||||
|
||||
|
|
@ -2448,30 +2583,38 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_cmpeq_epi8() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i8x32::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23,
|
||||
24, 25, 26, 27, 28, 29, 30, 31,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = i8x32::new(
|
||||
31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
|
||||
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
31, 30, 2, 28, 27, 26, 25, 24,
|
||||
23, 22, 21, 20, 19, 18, 17, 16,
|
||||
15, 14, 13, 12, 11, 10, 9, 8,
|
||||
7, 6, 5, 4, 3, 2, 1, 0,
|
||||
);
|
||||
let r = avx2::_mm256_cmpeq_epi8(a, b);
|
||||
assert_eq!(r, i8x32::splat(0).replace(2,0xFFu8 as i8));
|
||||
assert_eq!(r, i8x32::splat(0).replace(2, 0xFFu8 as i8));
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_cmpeq_epi16() {
|
||||
let a = i16x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let b = i16x16::new(
|
||||
15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
let a =
|
||||
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let b =
|
||||
i16x16::new(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
let r = avx2::_mm256_cmpeq_epi16(a, b);
|
||||
assert_eq!(r, i16x16::splat(0).replace(2, 0xFFFFu16 as i16));
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_cmpeq_epi32() {
|
||||
let a = i32x8::new(0, 1, 2, 3,4,5,6,7);
|
||||
let b = i32x8::new(7,6,2,4,3, 2, 1, 0);
|
||||
let a = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7);
|
||||
let b = i32x8::new(7, 6, 2, 4, 3, 2, 1, 0);
|
||||
let r = avx2::_mm256_cmpeq_epi32(a, b);
|
||||
assert_eq!(r, i32x8::splat(0).replace(2, 0xFFFFFFFFu32 as i32));
|
||||
}
|
||||
|
|
@ -2481,8 +2624,10 @@ mod tests {
|
|||
let a = i64x4::new(0, 1, 2, 3);
|
||||
let b = i64x4::new(3, 2, 2, 0);
|
||||
let r = avx2::_mm256_cmpeq_epi64(a, b);
|
||||
assert_eq!(r, i64x4::splat(0).replace(
|
||||
2, 0xFFFFFFFFFFFFFFFFu64 as i64));
|
||||
assert_eq!(
|
||||
r,
|
||||
i64x4::splat(0).replace(2, 0xFFFFFFFFFFFFFFFFu64 as i64)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
|
|
@ -2514,27 +2659,33 @@ mod tests {
|
|||
let a = i64x4::splat(0).replace(0, 5);
|
||||
let b = i64x4::splat(0);
|
||||
let r = avx2::_mm256_cmpgt_epi64(a, b);
|
||||
assert_eq!(r, i64x4::splat(0).replace(
|
||||
0, 0xFFFFFFFFFFFFFFFFu64 as i64));
|
||||
assert_eq!(
|
||||
r,
|
||||
i64x4::splat(0).replace(0, 0xFFFFFFFFFFFFFFFFu64 as i64)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_cvtepi8_epi16() {
|
||||
let a = i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
|
||||
let r = i16x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
|
||||
let a =
|
||||
i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
|
||||
let r =
|
||||
i16x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
|
||||
assert_eq!(r, avx2::_mm256_cvtepi8_epi16(a));
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_cvtepi8_epi32() {
|
||||
let a = i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
|
||||
let a =
|
||||
i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
|
||||
let r = i32x8::new(0, 0, -1, 1, -2, 2, -3, 3);
|
||||
assert_eq!(r, avx2::_mm256_cvtepi8_epi32(a));
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_cvtepi8_epi64() {
|
||||
let a = i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
|
||||
let a =
|
||||
i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7);
|
||||
let r = i64x4::new(0, 0, -1, 1);
|
||||
assert_eq!(r, avx2::_mm256_cvtepi8_epi64(a));
|
||||
}
|
||||
|
|
@ -2580,11 +2731,11 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_hadds_epi16() {
|
||||
let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,1);
|
||||
let a = i16x16::splat(2).replace(0, 0x7FFF).replace(1, 1);
|
||||
let b = i16x16::splat(4);
|
||||
let r = avx2::_mm256_hadds_epi16(a, b);
|
||||
let e = i16x16::new(
|
||||
0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
|
||||
let e =
|
||||
i16x16::new(0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -2608,10 +2759,10 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_hsubs_epi16() {
|
||||
let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,-1);
|
||||
let a = i16x16::splat(2).replace(0, 0x7FFF).replace(1, -1);
|
||||
let b = i16x16::splat(4);
|
||||
let r = avx2::_mm256_hsubs_epi16(a, b);
|
||||
let e = i16x16::splat(0).replace(0,0x7FFF);
|
||||
let e = i16x16::splat(0).replace(0, 0x7FFF);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -2902,11 +3053,13 @@ mod tests {
|
|||
let a = i16x16::splat(2);
|
||||
let b = i16x16::splat(4);
|
||||
let r = avx2::_mm256_packs_epi16(a, b);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x32::new(
|
||||
2, 2, 2, 2, 2, 2, 2, 2,
|
||||
4, 4, 4, 4, 4, 4, 4, 4,
|
||||
2, 2, 2, 2, 2, 2, 2, 2,
|
||||
4, 4, 4, 4, 4, 4, 4, 4);
|
||||
4, 4, 4, 4, 4, 4, 4, 4,
|
||||
);
|
||||
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -2916,11 +3069,7 @@ mod tests {
|
|||
let a = i32x8::splat(2);
|
||||
let b = i32x8::splat(4);
|
||||
let r = avx2::_mm256_packs_epi32(a, b);
|
||||
let e = i16x16::new(
|
||||
2, 2, 2, 2,
|
||||
4, 4, 4, 4,
|
||||
2, 2, 2, 2,
|
||||
4, 4, 4, 4);
|
||||
let e = i16x16::new(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
|
||||
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -2930,11 +3079,13 @@ mod tests {
|
|||
let a = i16x16::splat(2);
|
||||
let b = i16x16::splat(4);
|
||||
let r = avx2::_mm256_packus_epi16(a, b);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = u8x32::new(
|
||||
2, 2, 2, 2, 2, 2, 2, 2,
|
||||
4, 4, 4, 4, 4, 4, 4, 4,
|
||||
2, 2, 2, 2, 2, 2, 2, 2,
|
||||
4, 4, 4, 4, 4, 4, 4, 4);
|
||||
4, 4, 4, 4, 4, 4, 4, 4,
|
||||
);
|
||||
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -2944,11 +3095,7 @@ mod tests {
|
|||
let a = i32x8::splat(2);
|
||||
let b = i32x8::splat(4);
|
||||
let r = avx2::_mm256_packus_epi32(a, b);
|
||||
let e = u16x16::new(
|
||||
2, 2, 2, 2,
|
||||
4, 4, 4, 4,
|
||||
2, 2, 2, 2,
|
||||
4, 4, 4, 4);
|
||||
let e = u16x16::new(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
|
||||
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
|
@ -3017,21 +3164,24 @@ mod tests {
|
|||
unsafe fn _mm256_slli_epi16() {
|
||||
assert_eq!(
|
||||
avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4),
|
||||
i16x16::splat(0xFF0));
|
||||
i16x16::splat(0xFF0)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_slli_epi32() {
|
||||
assert_eq!(
|
||||
avx2::_mm256_slli_epi32(i32x8::splat(0xFFFF), 4),
|
||||
i32x8::splat(0xFFFF0));
|
||||
i32x8::splat(0xFFFF0)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_slli_epi64() {
|
||||
assert_eq!(
|
||||
avx2::_mm256_slli_epi64(i64x4::splat(0xFFFFFFFF), 4),
|
||||
i64x4::splat(0xFFFFFFFF0));
|
||||
i64x4::splat(0xFFFFFFFF0)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
|
|
@ -3090,14 +3240,16 @@ mod tests {
|
|||
unsafe fn _mm256_srai_epi16() {
|
||||
assert_eq!(
|
||||
avx2::_mm256_srai_epi16(i16x16::splat(-1), 1),
|
||||
i16x16::splat(-1));
|
||||
i16x16::splat(-1)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_srai_epi32() {
|
||||
assert_eq!(
|
||||
avx2::_mm256_srai_epi32(i32x8::splat(-1), 1),
|
||||
i32x8::splat(-1));
|
||||
i32x8::splat(-1)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
|
|
@ -3106,7 +3258,7 @@ mod tests {
|
|||
let count = i32x4::splat(1);
|
||||
let r = avx2::_mm_srav_epi32(a, count);
|
||||
let e = i32x4::splat(2);
|
||||
assert_eq!(r, e );
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
|
|
@ -3115,7 +3267,7 @@ mod tests {
|
|||
let count = i32x8::splat(1);
|
||||
let r = avx2::_mm256_srav_epi32(a, count);
|
||||
let e = i32x8::splat(2);
|
||||
assert_eq!(r, e );
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
|
|
@ -3146,21 +3298,24 @@ mod tests {
|
|||
unsafe fn _mm256_srli_epi16() {
|
||||
assert_eq!(
|
||||
avx2::_mm256_srli_epi16(i16x16::splat(0xFF), 4),
|
||||
i16x16::splat(0xF));
|
||||
i16x16::splat(0xF)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_srli_epi32() {
|
||||
assert_eq!(
|
||||
avx2::_mm256_srli_epi32(i32x8::splat(0xFFFF), 4),
|
||||
i32x8::splat(0xFFF));
|
||||
i32x8::splat(0xFFF)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_srli_epi64() {
|
||||
assert_eq!(
|
||||
avx2::_mm256_srli_epi64(i64x4::splat(0xFFFFFFFF), 4),
|
||||
i64x4::splat(0xFFFFFFF));
|
||||
i64x4::splat(0xFFFFFFF)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "avx2"]
|
||||
|
|
@ -3274,41 +3429,51 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_alignr_epi8() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i8x32::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32);
|
||||
25, 26, 27, 28, 29, 30, 31, 32,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = i8x32::new(
|
||||
-1, -2, -3, -4, -5, -6, -7, -8,
|
||||
-9, -10, -11, -12, -13, -14, -15, -16,
|
||||
-17, -18, -19, -20, -21, -22, -23, -24,
|
||||
-25, -26, -27, -28, -29, -30, -31, -32);
|
||||
-25, -26, -27, -28, -29, -30, -31, -32,
|
||||
);
|
||||
let r = avx2::_mm256_alignr_epi8(a, b, 33);
|
||||
assert_eq!(r, i8x32::splat(0));
|
||||
|
||||
let r = avx2::_mm256_alignr_epi8(a, b, 17);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let expected = i8x32::new(
|
||||
2, 3, 4, 5, 6, 7, 8, 9,
|
||||
10, 11, 12, 13, 14, 15, 16, 17,
|
||||
18, 19, 20, 21, 22, 23, 24, 25,
|
||||
26, 27, 28, 29, 30, 31, 32, 0);
|
||||
26, 27, 28, 29, 30, 31, 32, 0,
|
||||
);
|
||||
assert_eq!(r, expected);
|
||||
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let expected = i8x32::new(
|
||||
-17, -18, -19, -20, -21, -22, -23, -24,
|
||||
-25, -26, -27, -28, -29, -30, -31, -32,
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16);
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
);
|
||||
let r = avx2::_mm256_alignr_epi8(a, b, 16);
|
||||
assert_eq!(r, expected);
|
||||
|
||||
let r = avx2::_mm256_alignr_epi8(a, b, 15);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let expected = i8x32::new(
|
||||
-16, -17, -18, -19, -20, -21, -22, -23,
|
||||
-24, -25, -26, -27, -28, -29, -30, -31,
|
||||
-32, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15);
|
||||
8, 9, 10, 11, 12, 13, 14, 15,
|
||||
);
|
||||
assert_eq!(r, expected);
|
||||
|
||||
let r = avx2::_mm256_alignr_epi8(a, b, 0);
|
||||
|
|
@ -3317,18 +3482,21 @@ mod tests {
|
|||
|
||||
#[simd_test = "avx2"]
|
||||
unsafe fn _mm256_shuffle_epi8() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = u8x32::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32
|
||||
25, 26, 27, 28, 29, 30, 31, 32,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = u8x32::new(
|
||||
4, 128, 4, 3, 24, 12, 6, 19,
|
||||
12, 5, 5, 10, 4, 1, 8, 0,
|
||||
4, 128, 4, 3, 24, 12, 6, 19,
|
||||
12, 5, 5, 10, 4, 1, 8, 0,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let expected = u8x32::new(
|
||||
5, 0, 5, 4, 9, 13, 7, 4,
|
||||
13, 6, 6, 11, 5, 2, 9, 1,
|
||||
|
|
|
|||
|
|
@ -1,11 +1,16 @@
|
|||
//! Bit Manipulation Instruction (BMI) Set 1.0.
|
||||
//!
|
||||
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
|
||||
//! Manual Volume 2: Instruction Set Reference,
|
||||
//! A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf).
|
||||
//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
|
||||
//!
|
||||
//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#BMI1_.28Bit_Manipulation_Instruction_Set_1.29)
|
||||
//! provides a quick overview of the available instructions.
|
||||
//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
|
||||
//! available.
|
||||
//!
|
||||
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
|
||||
//! [wikipedia_bmi]:
|
||||
//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.
|
||||
//! 28Advanced_Bit_Manipulation.29
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
use stdsimd_test::assert_instr;
|
||||
|
|
@ -32,8 +37,8 @@ pub unsafe fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
|
|||
/// Extracts bits of `a` specified by `control` into
|
||||
/// the least significant bits of the result.
|
||||
///
|
||||
/// Bits [7,0] of `control` specify the index to the first bit in the range to be
|
||||
/// extracted, and bits [15,8] specify the length of the range.
|
||||
/// Bits [7,0] of `control` specify the index to the first bit in the range to
|
||||
/// be extracted, and bits [15,8] specify the length of the range.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(bextr))]
|
||||
|
|
@ -44,8 +49,8 @@ pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 {
|
|||
/// Extracts bits of `a` specified by `control` into
|
||||
/// the least significant bits of the result.
|
||||
///
|
||||
/// Bits [7,0] of `control` specify the index to the first bit in the range to be
|
||||
/// extracted, and bits [15,8] specify the length of the range.
|
||||
/// Bits [7,0] of `control` specify the index to the first bit in the range to
|
||||
/// be extracted, and bits [15,8] specify the length of the range.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+bmi"]
|
||||
#[cfg_attr(test, assert_instr(bextr))]
|
||||
|
|
@ -177,9 +182,9 @@ pub unsafe fn _mm_tzcnt_u64(x: u64) -> u64 {
|
|||
|
||||
#[allow(dead_code)]
|
||||
extern "C" {
|
||||
#[link_name="llvm.x86.bmi.bextr.32"]
|
||||
#[link_name = "llvm.x86.bmi.bextr.32"]
|
||||
fn x86_bmi_bextr_32(x: u32, y: u32) -> u32;
|
||||
#[link_name="llvm.x86.bmi.bextr.64"]
|
||||
#[link_name = "llvm.x86.bmi.bextr.64"]
|
||||
fn x86_bmi_bextr_64(x: u64, y: u64) -> u64;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,15 @@
|
|||
//! Bit Manipulation Instruction (BMI) Set 2.0.
|
||||
//!
|
||||
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
|
||||
//! Manual Volume 2: Instruction Set Reference,
|
||||
//! A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectu res-software-developer-instruction-set-reference-manual-325383.pdf).
|
||||
//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
|
||||
//!
|
||||
//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#BMI2_.28Bit_Manipulation_Instruction_Set_2.29)
|
||||
//! provides a quick overview of the available instructions.
|
||||
//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions
|
||||
//! available.
|
||||
//!
|
||||
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
|
||||
//! [wikipedia_bmi]:
|
||||
//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.
|
||||
//! 28Advanced_Bit_Manipulation.29
|
||||
|
||||
#[cfg(test)]
|
||||
use stdsimd_test::assert_instr;
|
||||
|
|
@ -96,17 +100,17 @@ pub unsafe fn _pext_u64(a: u64, mask: u64) -> u64 {
|
|||
|
||||
#[allow(dead_code)]
|
||||
extern "C" {
|
||||
#[link_name="llvm.x86.bmi.bzhi.32"]
|
||||
#[link_name = "llvm.x86.bmi.bzhi.32"]
|
||||
fn x86_bmi2_bzhi_32(x: u32, y: u32) -> u32;
|
||||
#[link_name="llvm.x86.bmi.bzhi.64"]
|
||||
#[link_name = "llvm.x86.bmi.bzhi.64"]
|
||||
fn x86_bmi2_bzhi_64(x: u64, y: u64) -> u64;
|
||||
#[link_name="llvm.x86.bmi.pdep.32"]
|
||||
#[link_name = "llvm.x86.bmi.pdep.32"]
|
||||
fn x86_bmi2_pdep_32(x: u32, y: u32) -> u32;
|
||||
#[link_name="llvm.x86.bmi.pdep.64"]
|
||||
#[link_name = "llvm.x86.bmi.pdep.64"]
|
||||
fn x86_bmi2_pdep_64(x: u64, y: u64) -> u64;
|
||||
#[link_name="llvm.x86.bmi.pext.32"]
|
||||
#[link_name = "llvm.x86.bmi.pext.32"]
|
||||
fn x86_bmi2_pext_32(x: u32, y: u32) -> u32;
|
||||
#[link_name="llvm.x86.bmi.pext.64"]
|
||||
#[link_name = "llvm.x86.bmi.pext.64"]
|
||||
fn x86_bmi2_pext_64(x: u64, y: u64) -> u64;
|
||||
}
|
||||
|
||||
|
|
@ -118,7 +122,7 @@ mod tests {
|
|||
|
||||
#[simd_test = "bmi2"]
|
||||
unsafe fn _pext_u32() {
|
||||
let n = 0b1011_1110_1001_0011u32;
|
||||
let n = 0b1011_1110_1001_0011u32;
|
||||
|
||||
let m0 = 0b0110_0011_1000_0101u32;
|
||||
let s0 = 0b0000_0000_0011_0101u32;
|
||||
|
|
@ -133,7 +137,7 @@ mod tests {
|
|||
#[simd_test = "bmi2"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
unsafe fn _pext_u64() {
|
||||
let n = 0b1011_1110_1001_0011u64;
|
||||
let n = 0b1011_1110_1001_0011u64;
|
||||
|
||||
let m0 = 0b0110_0011_1000_0101u64;
|
||||
let s0 = 0b0000_0000_0011_0101u64;
|
||||
|
|
@ -147,7 +151,7 @@ mod tests {
|
|||
|
||||
#[simd_test = "bmi2"]
|
||||
unsafe fn _pdep_u32() {
|
||||
let n = 0b1011_1110_1001_0011u32;
|
||||
let n = 0b1011_1110_1001_0011u32;
|
||||
|
||||
let m0 = 0b0110_0011_1000_0101u32;
|
||||
let s0 = 0b0000_0010_0000_0101u32;
|
||||
|
|
@ -162,7 +166,7 @@ mod tests {
|
|||
#[simd_test = "bmi2"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
unsafe fn _pdep_u64() {
|
||||
let n = 0b1011_1110_1001_0011u64;
|
||||
let n = 0b1011_1110_1001_0011u64;
|
||||
|
||||
let m0 = 0b0110_0011_1000_0101u64;
|
||||
let s0 = 0b0000_0010_0000_0101u64;
|
||||
|
|
@ -194,23 +198,31 @@ mod tests {
|
|||
let a: u32 = 4_294_967_200;
|
||||
let b: u32 = 2;
|
||||
let (lo, hi): (u32, u32) = bmi2::_mulx_u32(a, b);
|
||||
// result = 8589934400
|
||||
// = 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64
|
||||
// ^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
/*
|
||||
result = 8589934400
|
||||
= 0b0001_1111_1111_1111_1111_1111_1111_0100_0000u64
|
||||
^~hi ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
*/
|
||||
assert_eq!(lo, 0b1111_1111_1111_1111_1111_1111_0100_0000u32);
|
||||
assert_eq!(hi, 0b0001u32);
|
||||
}
|
||||
|
||||
#[simd_test = "bmi2"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
unsafe fn _mulx_u64() {
|
||||
let a: u64 = 9_223_372_036_854_775_800;
|
||||
let b: u64 = 100;
|
||||
let (lo, hi): (u64, u64) = bmi2::_mulx_u64(a, b);
|
||||
// result = 922337203685477580000
|
||||
// = 0b00110001_11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u128
|
||||
// ^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
assert_eq!(lo, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u64);
|
||||
/*
|
||||
result = 922337203685477580000 =
|
||||
0b00110001_1111111111111111_1111111111111111_1111111111111111_1111110011100000
|
||||
^~hi~~~~ ^~lo~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
*/
|
||||
assert_eq!(
|
||||
lo,
|
||||
0b11111111_11111111_11111111_11111111_11111111_11111111_11111100_11100000u64
|
||||
);
|
||||
assert_eq!(hi, 0b00110001u64);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -348,5 +348,3 @@ macro_rules! assert_approx_eq {
|
|||
*a, *b, $eps, (*a - *b).abs());
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ pub use self::bmi::*;
|
|||
pub use self::bmi2::*;
|
||||
pub use self::tbm::*;
|
||||
|
||||
pub use self::runtime::{__Feature, __unstable_detect_feature};
|
||||
pub use self::runtime::{__unstable_detect_feature, __Feature};
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
pub type __m128i = ::v128::i8x16;
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
//! This module implements minimal run-time feature detection for x86.
|
||||
//!
|
||||
//! The features are detected using the `detect_features` function below. This function
|
||||
//! uses the CPUID instruction to read the feature flags from the CPU and encodes them in
|
||||
//! an `usize` where each bit position represents whether a feature is available (bit is set)
|
||||
//! or unavaiable (bit is cleared).
|
||||
//! The features are detected using the `detect_features` function below.
|
||||
//! This function uses the CPUID instruction to read the feature flags from the
|
||||
//! CPU and encodes them in an `usize` where each bit position represents
|
||||
//! whether a feature is available (bit is set) or unavaiable (bit is cleared).
|
||||
//!
|
||||
//! The enum `__Feature` is used to map bit positions to feature names, and the
|
||||
//! the `__unstable_detect_feature!` macro is used to map string literals (e.g.
|
||||
|
|
@ -12,10 +12,10 @@
|
|||
//!
|
||||
//! The run-time feature detection is performed by the
|
||||
//! `__unstable_detect_feature(__Feature) -> bool` function. On its first call,
|
||||
//! this functions queries the CPU for the available features and stores them in
|
||||
//! a global `AtomicUsize` variable. The query is performed by just checking whether the
|
||||
//! feature bit in this global variable is set or cleared.
|
||||
use ::std::sync::atomic::{AtomicUsize, Ordering};
|
||||
//! this functions queries the CPU for the available features and stores them
|
||||
//! in a global `AtomicUsize` variable. The query is performed by just checking
|
||||
//! whether the feature bit in this global variable is set or cleared.
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
/// This macro maps the string-literal feature names to values of the
|
||||
/// `__Feature` enum at compile-time. The feature names used are the same as
|
||||
|
|
@ -26,22 +26,68 @@ use ::std::sync::atomic::{AtomicUsize, Ordering};
|
|||
#[macro_export]
|
||||
#[doc(hidden)]
|
||||
macro_rules! __unstable_detect_feature {
|
||||
("sse") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse{}) };
|
||||
("sse2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse2{}) };
|
||||
("sse3") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse3{}) };
|
||||
("ssse3") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::ssse3{}) };
|
||||
("sse4.1") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse4_1{}) };
|
||||
("sse4.2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse4_2{}) };
|
||||
("avx") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::avx{}) };
|
||||
("avx2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::avx2{}) };
|
||||
("fma") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::fma{}) };
|
||||
("bmi") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::bmi{}) };
|
||||
("bmi2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::bmi2{}) };
|
||||
("abm") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::abm{}) };
|
||||
("lzcnt") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::abm{}) };
|
||||
("tbm") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::tbm{}) };
|
||||
("popcnt") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::popcnt{}) };
|
||||
($t:tt) => { compile_error!(concat!("unknown target feature: ", $t)) };
|
||||
("sse") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::sse{}) };
|
||||
("sse2") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::sse2{})
|
||||
};
|
||||
("sse3") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::sse3{})
|
||||
};
|
||||
("ssse3") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::ssse3{})
|
||||
};
|
||||
("sse4.1") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::sse4_1{})
|
||||
};
|
||||
("sse4.2") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::sse4_2{})
|
||||
};
|
||||
("avx") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::avx{})
|
||||
};
|
||||
("avx2") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::avx2{})
|
||||
};
|
||||
("fma") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::fma{})
|
||||
};
|
||||
("bmi") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::bmi{})
|
||||
};
|
||||
("bmi2") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::bmi2{})
|
||||
};
|
||||
("abm") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::abm{})
|
||||
};
|
||||
("lzcnt") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::abm{})
|
||||
};
|
||||
("tbm") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::tbm{})
|
||||
};
|
||||
("popcnt") => {
|
||||
$crate::vendor::__unstable_detect_feature(
|
||||
$crate::vendor::__Feature::popcnt{})
|
||||
};
|
||||
($t:tt) => {
|
||||
compile_error!(concat!("unknown target feature: ", $t))
|
||||
};
|
||||
}
|
||||
|
||||
/// X86 CPU Feature enum. Each variant denotes a position in a bitset for a
|
||||
|
|
@ -74,15 +120,15 @@ pub enum __Feature {
|
|||
bmi,
|
||||
/// BMI1 (Bit Manipulation Instructions 2)
|
||||
bmi2,
|
||||
/// ABM (Advanced Bit Manipulation) on AMD / LZCNT (Leading Zero Count) on Intel
|
||||
/// ABM (Advanced Bit Manipulation) on AMD / LZCNT (Leading Zero
|
||||
/// Count) on Intel
|
||||
abm,
|
||||
/// TBM (Trailing Bit Manipulation)
|
||||
tbm,
|
||||
/// POPCNT (Population Count)
|
||||
popcnt,
|
||||
|
||||
#[doc(hidden)]
|
||||
__NonExhaustive
|
||||
#[doc(hidden)] __NonExhaustive,
|
||||
}
|
||||
|
||||
fn set_bit(x: usize, bit: u32) -> usize {
|
||||
|
|
@ -102,14 +148,19 @@ fn inv_test_bit(v: usize, idx: u32) -> bool {
|
|||
|
||||
/// Run-time feature detection on x86 works by using the CPUID instruction.
|
||||
///
|
||||
/// The [CPUID Wikipedia page](https://en.wikipedia.org/wiki/CPUID) contains all
|
||||
/// the information about which flags to set to query which values, and in which
|
||||
/// registers these are reported.
|
||||
/// The [CPUID Wikipedia page][wiki_cpuid] contains
|
||||
/// all the information about which flags to set to query which values, and in
|
||||
/// which registers these are reported.
|
||||
///
|
||||
/// The definitive references are:
|
||||
/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: Instruction Set Reference, A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf).
|
||||
/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and System Instructions](http://support.amd.com/TechDocs/24594.pdf).
|
||||
/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
|
||||
/// Instruction Set Reference, A-Z][intel64_ref].
|
||||
/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
|
||||
/// System Instructions][amd64_ref].
|
||||
///
|
||||
/// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID
|
||||
/// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
|
||||
/// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
|
||||
fn detect_features() -> usize {
|
||||
let ebx;
|
||||
let ecx;
|
||||
|
|
@ -119,14 +170,16 @@ fn detect_features() -> usize {
|
|||
/// To obtain all feature flags we need two CPUID queries:
|
||||
|
||||
/// 1. EAX=1, ECX=0: Queries "Processor Info and Feature Bits"
|
||||
/// This gives us most of the CPU features in ECX and EDX (see below),
|
||||
/// This gives us most of the CPU features in ECX and EDX (see
|
||||
/// below),
|
||||
asm!("cpuid"
|
||||
: "={ecx}"(ecx), "={edx}"(edx)
|
||||
: "{eax}"(0x00000001u32), "{ecx}"(0 as u32)
|
||||
: :);
|
||||
|
||||
/// 2. EAX=7, ECX=0: Queries "Extended Features"
|
||||
/// This gives us information about bmi,bmi2, and avx2 support (see below).
|
||||
/// This gives us information about bmi,bmi2, and avx2 support
|
||||
/// (see below).
|
||||
asm!("cpuid"
|
||||
: "={ebx}"(ebx)
|
||||
: "{eax}"(0x00000007u32), "{ecx}"(0 as u32)
|
||||
|
|
@ -135,36 +188,65 @@ fn detect_features() -> usize {
|
|||
|
||||
let mut value: usize = 0;
|
||||
|
||||
// CPUID call with EAX=7, ECX=0 => Extended Features in EBX and ECX (unneeded):
|
||||
if inv_test_bit(ebx, 3) { value = set_bit(value, __Feature::bmi as u32); }
|
||||
if inv_test_bit(ebx, 5) { value = set_bit(value, __Feature::avx2 as u32); }
|
||||
if inv_test_bit(ebx, 8) { value = set_bit(value, __Feature::bmi2 as u32); }
|
||||
// CPUID call with EAX=7, ECX=0 => Extended Features in EBX and ECX
|
||||
// (the result in ECX is not currently needed):
|
||||
if inv_test_bit(ebx, 3) {
|
||||
value = set_bit(value, __Feature::bmi as u32);
|
||||
}
|
||||
if inv_test_bit(ebx, 5) {
|
||||
value = set_bit(value, __Feature::avx2 as u32);
|
||||
}
|
||||
if inv_test_bit(ebx, 8) {
|
||||
value = set_bit(value, __Feature::bmi2 as u32);
|
||||
}
|
||||
|
||||
// CPUID call with EAX=1 => feature bits in ECX and EDX:
|
||||
if inv_test_bit(ecx, 0) { value = set_bit(value, __Feature::sse3 as u32); }
|
||||
if inv_test_bit(ecx, 5) { value = set_bit(value, __Feature::abm as u32); }
|
||||
if inv_test_bit(ecx, 9) { value = set_bit(value, __Feature::ssse3 as u32); }
|
||||
if inv_test_bit(ecx, 12) { value = set_bit(value, __Feature::fma as u32); }
|
||||
if inv_test_bit(ecx, 19) { value = set_bit(value, __Feature::sse4_1 as u32); }
|
||||
if inv_test_bit(ecx, 20) { value = set_bit(value, __Feature::sse4_2 as u32); }
|
||||
if inv_test_bit(ecx, 21) { value = set_bit(value, __Feature::tbm as u32); }
|
||||
if inv_test_bit(ecx, 23) { value = set_bit(value, __Feature::popcnt as u32); }
|
||||
if inv_test_bit(ecx, 28) { value = set_bit(value, __Feature::avx as u32); }
|
||||
if inv_test_bit(ecx, 0) {
|
||||
value = set_bit(value, __Feature::sse3 as u32);
|
||||
}
|
||||
if inv_test_bit(ecx, 5) {
|
||||
value = set_bit(value, __Feature::abm as u32);
|
||||
}
|
||||
if inv_test_bit(ecx, 9) {
|
||||
value = set_bit(value, __Feature::ssse3 as u32);
|
||||
}
|
||||
if inv_test_bit(ecx, 12) {
|
||||
value = set_bit(value, __Feature::fma as u32);
|
||||
}
|
||||
if inv_test_bit(ecx, 19) {
|
||||
value = set_bit(value, __Feature::sse4_1 as u32);
|
||||
}
|
||||
if inv_test_bit(ecx, 20) {
|
||||
value = set_bit(value, __Feature::sse4_2 as u32);
|
||||
}
|
||||
if inv_test_bit(ecx, 21) {
|
||||
value = set_bit(value, __Feature::tbm as u32);
|
||||
}
|
||||
if inv_test_bit(ecx, 23) {
|
||||
value = set_bit(value, __Feature::popcnt as u32);
|
||||
}
|
||||
if inv_test_bit(ecx, 28) {
|
||||
value = set_bit(value, __Feature::avx as u32);
|
||||
}
|
||||
|
||||
if inv_test_bit(edx, 25) { value = set_bit(value, __Feature::sse as u32); }
|
||||
if inv_test_bit(edx, 26) { value = set_bit(value, __Feature::sse2 as u32); }
|
||||
if inv_test_bit(edx, 25) {
|
||||
value = set_bit(value, __Feature::sse as u32);
|
||||
}
|
||||
if inv_test_bit(edx, 26) {
|
||||
value = set_bit(value, __Feature::sse2 as u32);
|
||||
}
|
||||
|
||||
value
|
||||
}
|
||||
|
||||
/// This global variable is a bitset used to cache the features supported by the
|
||||
/// CPU.
|
||||
/// This global variable is a bitset used to cache the features supported by
|
||||
/// the CPU.
|
||||
static FEATURES: AtomicUsize = AtomicUsize::new(::std::usize::MAX);
|
||||
|
||||
/// Performs run-time feature detection.
|
||||
///
|
||||
/// On its first invocation, it detects the CPU features and caches them in the
|
||||
/// `FEATURES` global variable as an `AtomicUsize`.
|
||||
/// On its first invocation, it detects the CPU features and caches them
|
||||
/// in the `FEATURES` global variable as an `AtomicUsize`.
|
||||
///
|
||||
/// It uses the `__Feature` variant to index into this variable as a bitset. If
|
||||
/// the bit is set, the feature is enabled, and otherwise it is disabled.
|
||||
|
|
@ -172,7 +254,7 @@ static FEATURES: AtomicUsize = AtomicUsize::new(::std::usize::MAX);
|
|||
/// PLEASE: do not use this, it is an implementation detail subject to change.
|
||||
#[doc(hidden)]
|
||||
pub fn __unstable_detect_feature(x: __Feature) -> bool {
|
||||
if FEATURES.load(Ordering::Relaxed) == ::std::usize::MAX {
|
||||
if FEATURES.load(Ordering::Relaxed) == ::std::usize::MAX {
|
||||
FEATURES.store(detect_features(), Ordering::Relaxed);
|
||||
}
|
||||
test_bit(FEATURES.load(Ordering::Relaxed), x as u32)
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -5,9 +5,8 @@ use std::mem;
|
|||
use std::os::raw::c_void;
|
||||
use std::ptr;
|
||||
|
||||
use simd_llvm::{
|
||||
simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
|
||||
};
|
||||
use simd_llvm::{simd_cast, simd_shuffle16, simd_shuffle2, simd_shuffle4,
|
||||
simd_shuffle8};
|
||||
use x86::__m128i;
|
||||
use v128::*;
|
||||
use v64::*;
|
||||
|
|
@ -317,7 +316,9 @@ pub unsafe fn _mm_subs_epu16(a: u16x8, b: u16x8) -> u16x8 {
|
|||
#[cfg_attr(test, assert_instr(pslldq, imm8 = 1))]
|
||||
pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i {
|
||||
let (zero, imm8) = (__m128i::splat(0), imm8 as u32);
|
||||
const fn sub(a: u32, b: u32) -> u32 { a - b }
|
||||
const fn sub(a: u32, b: u32) -> u32 {
|
||||
a - b
|
||||
}
|
||||
macro_rules! shuffle {
|
||||
($shift:expr) => {
|
||||
simd_shuffle16::<__m128i, __m128i>(zero, a, [
|
||||
|
|
@ -333,14 +334,22 @@ pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i {
|
|||
}
|
||||
}
|
||||
match imm8 {
|
||||
0 => shuffle!(0), 1 => shuffle!(1),
|
||||
2 => shuffle!(2), 3 => shuffle!(3),
|
||||
4 => shuffle!(4), 5 => shuffle!(5),
|
||||
6 => shuffle!(6), 7 => shuffle!(7),
|
||||
8 => shuffle!(8), 9 => shuffle!(9),
|
||||
10 => shuffle!(10), 11 => shuffle!(11),
|
||||
12 => shuffle!(12), 13 => shuffle!(13),
|
||||
14 => shuffle!(14), 15 => shuffle!(15),
|
||||
0 => shuffle!(0),
|
||||
1 => shuffle!(1),
|
||||
2 => shuffle!(2),
|
||||
3 => shuffle!(3),
|
||||
4 => shuffle!(4),
|
||||
5 => shuffle!(5),
|
||||
6 => shuffle!(6),
|
||||
7 => shuffle!(7),
|
||||
8 => shuffle!(8),
|
||||
9 => shuffle!(9),
|
||||
10 => shuffle!(10),
|
||||
11 => shuffle!(11),
|
||||
12 => shuffle!(12),
|
||||
13 => shuffle!(13),
|
||||
14 => shuffle!(14),
|
||||
15 => shuffle!(15),
|
||||
_ => shuffle!(16),
|
||||
}
|
||||
}
|
||||
|
|
@ -365,7 +374,7 @@ pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(psllw))]
|
||||
pub unsafe fn _mm_slli_epi16(a: i16x8, imm8: i32) -> i16x8 {
|
||||
pub unsafe fn _mm_slli_epi16(a: i16x8, imm8: i32) -> i16x8 {
|
||||
pslliw(a, imm8)
|
||||
}
|
||||
|
||||
|
|
@ -454,7 +463,9 @@ pub unsafe fn _mm_sra_epi32(a: i32x4, count: i32x4) -> i32x4 {
|
|||
#[cfg_attr(test, assert_instr(psrldq, imm8 = 1))]
|
||||
pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i {
|
||||
let (zero, imm8) = (__m128i::splat(0), imm8 as u32);
|
||||
const fn add(a: u32, b: u32) -> u32 { a + b }
|
||||
const fn add(a: u32, b: u32) -> u32 {
|
||||
a + b
|
||||
}
|
||||
macro_rules! shuffle {
|
||||
($shift:expr) => {
|
||||
simd_shuffle16::<__m128i, __m128i>(a, zero, [
|
||||
|
|
@ -470,14 +481,22 @@ pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i {
|
|||
}
|
||||
}
|
||||
match imm8 {
|
||||
0 => shuffle!(0), 1 => shuffle!(1),
|
||||
2 => shuffle!(2), 3 => shuffle!(3),
|
||||
4 => shuffle!(4), 5 => shuffle!(5),
|
||||
6 => shuffle!(6), 7 => shuffle!(7),
|
||||
8 => shuffle!(8), 9 => shuffle!(9),
|
||||
10 => shuffle!(10), 11 => shuffle!(11),
|
||||
12 => shuffle!(12), 13 => shuffle!(13),
|
||||
14 => shuffle!(14), 15 => shuffle!(15),
|
||||
0 => shuffle!(0),
|
||||
1 => shuffle!(1),
|
||||
2 => shuffle!(2),
|
||||
3 => shuffle!(3),
|
||||
4 => shuffle!(4),
|
||||
5 => shuffle!(5),
|
||||
6 => shuffle!(6),
|
||||
7 => shuffle!(7),
|
||||
8 => shuffle!(8),
|
||||
9 => shuffle!(9),
|
||||
10 => shuffle!(10),
|
||||
11 => shuffle!(11),
|
||||
12 => shuffle!(12),
|
||||
13 => shuffle!(13),
|
||||
14 => shuffle!(14),
|
||||
15 => shuffle!(15),
|
||||
_ => shuffle!(16),
|
||||
}
|
||||
}
|
||||
|
|
@ -487,7 +506,7 @@ pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(psrlw))]
|
||||
pub unsafe fn _mm_srli_epi16(a: i16x8, imm8: i32) -> i16x8 {
|
||||
pub unsafe fn _mm_srli_epi16(a: i16x8, imm8: i32) -> i16x8 {
|
||||
psrliw(a, imm8)
|
||||
}
|
||||
|
||||
|
|
@ -649,7 +668,7 @@ pub unsafe fn _mm_cmplt_epi32(a: i32x4, b: i32x4) -> i32x4 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(cvtdq2pd))]
|
||||
pub unsafe fn _mm_cvtepi32_pd(a: i32x4) -> f64x2 {
|
||||
pub unsafe fn _mm_cvtepi32_pd(a: i32x4) -> f64x2 {
|
||||
simd_cast::<i32x2, f64x2>(simd_shuffle2(a, a, [0, 1]))
|
||||
}
|
||||
|
||||
|
|
@ -777,7 +796,7 @@ pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 {
|
|||
#[target_feature = "+sse2"]
|
||||
// no particular instruction to test
|
||||
pub unsafe fn _mm_set_epi16(
|
||||
e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16,
|
||||
e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16
|
||||
) -> i16x8 {
|
||||
i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)
|
||||
}
|
||||
|
|
@ -790,6 +809,7 @@ pub unsafe fn _mm_set_epi8(
|
|||
e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8,
|
||||
e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8,
|
||||
) -> i8x16 {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
i8x16::new(
|
||||
e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
|
||||
)
|
||||
|
|
@ -840,7 +860,7 @@ pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> i32x4 {
|
|||
#[target_feature = "+sse2"]
|
||||
// no particular instruction to test
|
||||
pub unsafe fn _mm_setr_epi16(
|
||||
e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16,
|
||||
e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16
|
||||
) -> i16x8 {
|
||||
i16x8::new(e7, e6, e5, e4, e3, e2, e1, e0)
|
||||
}
|
||||
|
|
@ -853,6 +873,7 @@ pub unsafe fn _mm_setr_epi8(
|
|||
e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8,
|
||||
e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8,
|
||||
) -> i8x16 {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
i8x16::new(
|
||||
e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0,
|
||||
)
|
||||
|
|
@ -895,7 +916,8 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
|
|||
ptr::copy_nonoverlapping(
|
||||
mem_addr as *const u8,
|
||||
&mut dst as *mut __m128i as *mut u8,
|
||||
mem::size_of::<__m128i>());
|
||||
mem::size_of::<__m128i>(),
|
||||
);
|
||||
dst
|
||||
}
|
||||
|
||||
|
|
@ -934,7 +956,8 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
|
|||
ptr::copy_nonoverlapping(
|
||||
&a as *const _ as *const u8,
|
||||
mem_addr as *mut u8,
|
||||
mem::size_of::<__m128i>());
|
||||
mem::size_of::<__m128i>(),
|
||||
);
|
||||
}
|
||||
|
||||
/// Store the lower 64-bit integer `a` to a memory location.
|
||||
|
|
@ -945,7 +968,10 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
|
|||
// no particular instruction to test
|
||||
pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
|
||||
ptr::copy_nonoverlapping(
|
||||
&a as *const _ as *const u8, mem_addr as *mut u8, 8);
|
||||
&a as *const _ as *const u8,
|
||||
mem_addr as *mut u8,
|
||||
8,
|
||||
);
|
||||
}
|
||||
|
||||
/// Return a vector where the low element is extracted from `a` and its upper
|
||||
|
|
@ -1076,7 +1102,9 @@ pub unsafe fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 {
|
|||
pub unsafe fn _mm_shufflehi_epi16(a: i16x8, imm8: i32) -> i16x8 {
|
||||
// See _mm_shuffle_epi32.
|
||||
let imm8 = (imm8 & 0xFF) as u8;
|
||||
const fn add4(x: u32) -> u32 { x + 4 }
|
||||
const fn add4(x: u32) -> u32 {
|
||||
x + 4
|
||||
}
|
||||
|
||||
macro_rules! shuffle_done {
|
||||
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
|
||||
|
|
@ -1183,10 +1211,11 @@ pub unsafe fn _mm_shufflelo_epi16(a: i16x8, imm8: i32) -> i16x8 {
|
|||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(punpckhbw))]
|
||||
pub unsafe fn _mm_unpackhi_epi8(a: i8x16, b: i8x16) -> i8x16 {
|
||||
simd_shuffle16(a, b, [
|
||||
8, 24, 9, 25, 10, 26, 11, 27,
|
||||
12, 28, 13, 29, 14, 30, 15, 31,
|
||||
])
|
||||
simd_shuffle16(
|
||||
a,
|
||||
b,
|
||||
[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
|
||||
)
|
||||
}
|
||||
|
||||
/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
|
||||
|
|
@ -1218,10 +1247,11 @@ pub unsafe fn _mm_unpackhi_epi64(a: i64x2, b: i64x2) -> i64x2 {
|
|||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(punpcklbw))]
|
||||
pub unsafe fn _mm_unpacklo_epi8(a: i8x16, b: i8x16) -> i8x16 {
|
||||
simd_shuffle16(a, b, [
|
||||
0, 16, 1, 17, 2, 18, 3, 19,
|
||||
4, 20, 5, 21, 6, 22, 7, 23,
|
||||
])
|
||||
simd_shuffle16(
|
||||
a,
|
||||
b,
|
||||
[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
|
||||
)
|
||||
}
|
||||
|
||||
/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
|
||||
|
|
@ -1718,7 +1748,8 @@ pub unsafe fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool {
|
|||
mem::transmute(ucomineqsd(a, b) as u8)
|
||||
}
|
||||
|
||||
/// Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements
|
||||
/// Convert packed double-precision (64-bit) floating-point elements in "a" to
|
||||
/// packed single-precision (32-bit) floating-point elements
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(cvtpd2ps))]
|
||||
|
|
@ -1726,8 +1757,8 @@ pub unsafe fn _mm_cvtpd_ps(a: f64x2) -> f32x4 {
|
|||
cvtpd2ps(a)
|
||||
}
|
||||
|
||||
|
||||
/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
|
||||
/// Convert packed single-precision (32-bit) floating-point elements in `a` to
|
||||
/// packed
|
||||
/// double-precision (64-bit) floating-point elements.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
|
|
@ -1736,7 +1767,8 @@ pub unsafe fn _mm_cvtps_pd(a: f32x4) -> f64x2 {
|
|||
cvtps2pd(a)
|
||||
}
|
||||
|
||||
/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers.
|
||||
/// Convert packed double-precision (64-bit) floating-point elements in `a` to
|
||||
/// packed 32-bit integers.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(cvtpd2dq))]
|
||||
|
|
@ -1744,7 +1776,8 @@ pub unsafe fn _mm_cvtpd_epi32(a: f64x2) -> i32x4 {
|
|||
cvtpd2dq(a)
|
||||
}
|
||||
|
||||
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer.
|
||||
/// Convert the lower double-precision (64-bit) floating-point element in a to
|
||||
/// a 32-bit integer.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(cvtsd2si))]
|
||||
|
|
@ -1752,7 +1785,8 @@ pub unsafe fn _mm_cvtsd_si32(a: f64x2) -> i32 {
|
|||
cvtsd2si(a)
|
||||
}
|
||||
|
||||
/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer.
|
||||
/// Convert the lower double-precision (64-bit) floating-point element in a to
|
||||
/// a 64-bit integer.
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
|
|
@ -1761,9 +1795,10 @@ pub unsafe fn _mm_cvtsd_si64(a: f64x2) -> i64 {
|
|||
cvtsd2si64(a)
|
||||
}
|
||||
|
||||
/// Convert the lower double-precision (64-bit) floating-point element in `b` to a
|
||||
/// single-precision (32-bit) floating-point element, store the result in the lower element
|
||||
/// of the return value, and copy the upper element from `a` to the upper element the return value.
|
||||
/// Convert the lower double-precision (64-bit) floating-point element in `b`
|
||||
/// to a single-precision (32-bit) floating-point element, store the result in
|
||||
/// the lower element of the return value, and copy the upper element from `a`
|
||||
/// to the upper element the return value.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(cvtsd2ss))]
|
||||
|
|
@ -1771,18 +1806,19 @@ pub unsafe fn _mm_cvtsd_ss(a: f32x4, b: f64x2) -> f32x4 {
|
|||
cvtsd2ss(a, b)
|
||||
}
|
||||
|
||||
/// Convert the lower single-precision (32-bit) floating-point element in `b` to a
|
||||
/// double-precision (64-bit) floating-point element, store the result in the lower element
|
||||
/// of the return value, and copy the upper element from `a` to the upper element the return value.
|
||||
/// Convert the lower single-precision (32-bit) floating-point element in `b`
|
||||
/// to a double-precision (64-bit) floating-point element, store the result in
|
||||
/// the lower element of the return value, and copy the upper element from `a`
|
||||
/// to the upper element the return value.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(cvtss2sd))]
|
||||
pub unsafe fn _mm_cvtss_sd(a: f64x2, b: f32x4 ) -> f64x2 {
|
||||
pub unsafe fn _mm_cvtss_sd(a: f64x2, b: f32x4) -> f64x2 {
|
||||
cvtss2sd(a, b)
|
||||
}
|
||||
|
||||
/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
|
||||
/// 32-bit integers with truncation.
|
||||
/// Convert packed double-precision (64-bit) floating-point elements in `a` to
|
||||
/// packed 32-bit integers with truncation.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(cvttpd2dq))]
|
||||
|
|
@ -1790,8 +1826,8 @@ pub unsafe fn _mm_cvttpd_epi32(a: f64x2) -> i32x4 {
|
|||
cvttpd2dq(a)
|
||||
}
|
||||
|
||||
/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit integer
|
||||
/// with truncation.
|
||||
/// Convert the lower double-precision (64-bit) floating-point element in `a`
|
||||
/// to a 32-bit integer with truncation.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(cvttsd2si))]
|
||||
|
|
@ -1799,8 +1835,8 @@ pub unsafe fn _mm_cvttsd_si32(a: f64x2) -> i32 {
|
|||
cvttsd2si(a)
|
||||
}
|
||||
|
||||
/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit integer
|
||||
/// with truncation.
|
||||
/// Convert the lower double-precision (64-bit) floating-point element in `a`
|
||||
/// to a 64-bit integer with truncation.
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
|
|
@ -1809,8 +1845,8 @@ pub unsafe fn _mm_cvttsd_si64(a: f64x2) -> i64 {
|
|||
cvttsd2si64(a)
|
||||
}
|
||||
|
||||
/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit
|
||||
/// integers with truncation
|
||||
/// Convert packed single-precision (32-bit) floating-point elements in `a` to
|
||||
/// packed 32-bit integers with truncation.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(cvttps2dq))]
|
||||
|
|
@ -1818,45 +1854,48 @@ pub unsafe fn _mm_cvttps_epi32(a: f32x4) -> i32x4 {
|
|||
cvttps2dq(a)
|
||||
}
|
||||
|
||||
/// Copy double-precision (64-bit) floating-point element `a` to the lower element of the
|
||||
/// packed 64-bit return value
|
||||
/// Copy double-precision (64-bit) floating-point element `a` to the lower
|
||||
/// element of the packed 64-bit return value.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub unsafe fn _mm_set_sd(a: f64) -> f64x2 {
|
||||
f64x2::new(a, 0_f64)
|
||||
}
|
||||
|
||||
/// Broadcast double-precision (64-bit) floating-point value a to all elements of the return value
|
||||
/// Broadcast double-precision (64-bit) floating-point value a to all elements
|
||||
/// of the return value.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub unsafe fn _mm_set1_pd(a: f64) -> f64x2 {
|
||||
f64x2::new(a, a)
|
||||
}
|
||||
|
||||
/// Broadcast double-precision (64-bit) floating-point value a to all elements of the return value
|
||||
/// Broadcast double-precision (64-bit) floating-point value a to all elements
|
||||
/// of the return value.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub unsafe fn _mm_set_pd1(a: f64) -> f64x2 {
|
||||
f64x2::new(a, a)
|
||||
}
|
||||
|
||||
/// Set packed double-precision (64-bit) floating-point elements in the return value with the
|
||||
/// supplied values.
|
||||
/// Set packed double-precision (64-bit) floating-point elements in the return
|
||||
/// value with the supplied values.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub unsafe fn _mm_set_pd(a: f64, b: f64) -> f64x2 {
|
||||
f64x2::new(b, a)
|
||||
}
|
||||
|
||||
/// Set packed double-precision (64-bit) floating-point elements in the return value with the
|
||||
/// supplied values in reverse order.
|
||||
/// Set packed double-precision (64-bit) floating-point elements in the return
|
||||
/// value with the supplied values in reverse order.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> f64x2 {
|
||||
f64x2::new(a, b)
|
||||
}
|
||||
|
||||
/// returns packed double-precision (64-bit) floating-point elements with all zeros.
|
||||
/// returns packed double-precision (64-bit) floating-point elements with all
|
||||
/// zeros.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub unsafe fn _mm_setzero_pd() -> f64x2 {
|
||||
|
|
@ -1876,9 +1915,10 @@ pub unsafe fn _mm_movemask_pd(a: f64x2) -> i32 {
|
|||
|
||||
|
||||
|
||||
/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
|
||||
/// from memory into the returned vector. mem_addr must be aligned on a 16-byte boundary or
|
||||
/// a general-protection exception may be generated.
|
||||
/// Load 128-bits (composed of 2 packed double-precision (64-bit)
|
||||
/// floating-point elements) from memory into the returned vector.
|
||||
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
|
||||
/// exception may be generated.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(movaps))]
|
||||
|
|
@ -1886,9 +1926,9 @@ pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> f64x2 {
|
|||
*(mem_addr as *const f64x2)
|
||||
}
|
||||
|
||||
/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a`
|
||||
/// into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception
|
||||
/// may be generated.
|
||||
/// Store 128-bits (composed of 2 packed double-precision (64-bit)
|
||||
/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
|
||||
/// on a 16-byte boundary or a general-protection exception may be generated.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(movaps))]
|
||||
|
|
@ -1906,12 +1946,13 @@ pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: f64x2) {
|
|||
ptr::copy_nonoverlapping(
|
||||
&a as *const f64x2 as *const u8,
|
||||
mem_addr as *mut u8,
|
||||
mem::size_of::<f64x2>());
|
||||
mem::size_of::<f64x2>(),
|
||||
);
|
||||
}
|
||||
|
||||
/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous
|
||||
/// elements in memory. `mem_addr` must be aligned on a 16-byte boundary or a general-protection
|
||||
/// exception may be generated.
|
||||
/// Store the lower double-precision (64-bit) floating-point element from `a`
|
||||
/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
|
||||
/// 16-byte boundary or a general-protection exception may be generated.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: f64x2) {
|
||||
|
|
@ -1919,9 +1960,9 @@ pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: f64x2) {
|
|||
*(mem_addr as *mut f64x2) = b;
|
||||
}
|
||||
|
||||
/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous
|
||||
/// elements in memory. `mem_addr` must be aligned on a 16-byte boundary or a general-protection
|
||||
/// exception may be generated.
|
||||
/// Store the lower double-precision (64-bit) floating-point element from `a`
|
||||
/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
|
||||
/// 16-byte boundary or a general-protection exception may be generated.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: f64x2) {
|
||||
|
|
@ -1929,8 +1970,10 @@ pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: f64x2) {
|
|||
*(mem_addr as *mut f64x2) = b;
|
||||
}
|
||||
|
||||
/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order.
|
||||
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
|
||||
/// Store 2 double-precision (64-bit) floating-point elements from `a` into
|
||||
/// memory in reverse order.
|
||||
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
|
||||
/// exception may be generated.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: f64x2) {
|
||||
|
|
@ -1956,9 +1999,9 @@ pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> f64x2 {
|
|||
f64x2::new(d, d)
|
||||
}
|
||||
|
||||
/// Load 2 double-precision (64-bit) floating-point elements from memory into the returned vector
|
||||
/// in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection
|
||||
/// exception may be generated.
|
||||
/// Load 2 double-precision (64-bit) floating-point elements from memory into
|
||||
/// the returned vector in reverse order. `mem_addr` must be aligned on a
|
||||
/// 16-byte boundary or a general-protection exception may be generated.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(movapd))]
|
||||
|
|
@ -1967,9 +2010,9 @@ pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> f64x2 {
|
|||
simd_shuffle2(a, a, [1, 0])
|
||||
}
|
||||
|
||||
/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
|
||||
/// from memory into the returned vector. mem_addr does not need to be aligned on any particular
|
||||
/// oundary.
|
||||
/// Load 128-bits (composed of 2 packed double-precision (64-bit)
|
||||
/// floating-point elements) from memory into the returned vector.
|
||||
/// `mem_addr` does not need to be aligned on any particular boundary.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse2"]
|
||||
#[cfg_attr(test, assert_instr(movups))]
|
||||
|
|
@ -1978,7 +2021,8 @@ pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> f64x2 {
|
|||
ptr::copy_nonoverlapping(
|
||||
mem_addr as *const u8,
|
||||
&mut dst as *mut f64x2 as *mut u8,
|
||||
mem::size_of::<f64x2>());
|
||||
mem::size_of::<f64x2>(),
|
||||
);
|
||||
dst
|
||||
}
|
||||
|
||||
|
|
@ -1997,7 +2041,7 @@ pub unsafe fn _mm_undefined_si128() -> __m128i {
|
|||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
extern {
|
||||
extern "C" {
|
||||
#[link_name = "llvm.x86.sse2.pause"]
|
||||
fn pause();
|
||||
#[link_name = "llvm.x86.sse2.clflush"]
|
||||
|
|
@ -2145,7 +2189,7 @@ extern {
|
|||
#[link_name = "llvm.x86.sse2.cvtsd2ss"]
|
||||
fn cvtsd2ss(a: f32x4, b: f64x2) -> f32x4;
|
||||
#[link_name = "llvm.x86.sse2.cvtss2sd"]
|
||||
fn cvtss2sd(a: f64x2, b: f32x4 ) -> f64x2;
|
||||
fn cvtss2sd(a: f64x2, b: f32x4) -> f64x2;
|
||||
#[link_name = "llvm.x86.sse2.cvttpd2dq"]
|
||||
fn cvttpd2dq(a: f64x2) -> i32x4;
|
||||
#[link_name = "llvm.x86.sse2.cvttsd2si"]
|
||||
|
|
@ -2160,7 +2204,7 @@ extern {
|
|||
mod tests {
|
||||
use std::os::raw::c_void;
|
||||
use stdsimd_test::simd_test;
|
||||
use test::black_box; // Used to inhibit constant-folding.
|
||||
use test::black_box; // Used to inhibit constant-folding.
|
||||
|
||||
use v128::*;
|
||||
use x86::{__m128i, sse2};
|
||||
|
|
@ -2188,13 +2232,17 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_add_epi8() {
|
||||
let a = i8x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let a =
|
||||
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = i8x16::new(
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
);
|
||||
let r = sse2::_mm_add_epi8(a, b);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x16::new(
|
||||
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
|
||||
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -2235,13 +2283,17 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_adds_epi8() {
|
||||
let a = i8x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let a =
|
||||
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = i8x16::new(
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
);
|
||||
let r = sse2::_mm_adds_epi8(a, b);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x16::new(
|
||||
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
|
||||
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -2288,13 +2340,17 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_adds_epu8() {
|
||||
let a = u8x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let a =
|
||||
u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = u8x16::new(
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
);
|
||||
let r = sse2::_mm_adds_epu8(a, b);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = u8x16::new(
|
||||
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
|
||||
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -2410,12 +2466,11 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_sad_epu8() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = u8x16::new(
|
||||
255, 254, 253, 252, 1, 2, 3, 4,
|
||||
155, 154, 153, 152, 1, 2, 3, 4);
|
||||
let b = u8x16::new(
|
||||
0, 0, 0, 0, 2, 1, 2, 1,
|
||||
1, 1, 1, 1, 1, 2, 1, 2);
|
||||
255, 254, 253, 252, 1, 2, 3, 4, 155, 154, 153, 152, 1, 2, 3, 4,
|
||||
);
|
||||
let b = u8x16::new(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
|
||||
let r = sse2::_mm_sad_epu8(a, b);
|
||||
let e = u64x2::new(1020, 614);
|
||||
assert_eq!(r, e);
|
||||
|
|
@ -2527,44 +2582,58 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_slli_si128() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = __m128i::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
|
||||
);
|
||||
let r = sse2::_mm_slli_si128(a, 1);
|
||||
let e = __m128i::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let e =
|
||||
__m128i::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
assert_eq!(r, e);
|
||||
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = __m128i::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
|
||||
);
|
||||
let r = sse2::_mm_slli_si128(a, 15);
|
||||
let e = __m128i::new(
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
|
||||
let e = __m128i::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
|
||||
assert_eq!(r, e);
|
||||
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = __m128i::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
|
||||
);
|
||||
let r = sse2::_mm_slli_si128(a, 16);
|
||||
assert_eq!(r, __m128i::splat(0));
|
||||
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = __m128i::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
|
||||
);
|
||||
let r = sse2::_mm_slli_si128(a, -1);
|
||||
assert_eq!(r, __m128i::splat(0));
|
||||
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = __m128i::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
|
||||
);
|
||||
let r = sse2::_mm_slli_si128(a, -0x80000000);
|
||||
assert_eq!(r, __m128i::splat(0));
|
||||
}
|
||||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_slli_epi16() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i16x8::new(
|
||||
0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0);
|
||||
0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
|
||||
);
|
||||
let r = sse2::_mm_slli_epi16(a, 4);
|
||||
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i16x8::new(
|
||||
0xFFF0 as u16 as i16,
|
||||
0xFFF0 as u16 as i16, 0x0FF0, 0x00F0, 0, 0, 0, 0);
|
||||
0xFFF0 as u16 as i16, 0xFFF0 as u16 as i16, 0x0FF0, 0x00F0,
|
||||
0, 0, 0, 0,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -2635,44 +2704,58 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_srli_si128() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = __m128i::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
|
||||
);
|
||||
let r = sse2::_mm_srli_si128(a, 1);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = __m128i::new(
|
||||
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0);
|
||||
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = __m128i::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
|
||||
);
|
||||
let r = sse2::_mm_srli_si128(a, 15);
|
||||
let e = __m128i::new(
|
||||
16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
let e = __m128i::new(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
assert_eq!(r, e);
|
||||
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = __m128i::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
|
||||
);
|
||||
let r = sse2::_mm_srli_si128(a, 16);
|
||||
assert_eq!(r, __m128i::splat(0));
|
||||
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = __m128i::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
|
||||
);
|
||||
let r = sse2::_mm_srli_si128(a, -1);
|
||||
assert_eq!(r, __m128i::splat(0));
|
||||
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = __m128i::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
|
||||
);
|
||||
let r = sse2::_mm_srli_si128(a, -0x80000000);
|
||||
assert_eq!(r, __m128i::splat(0));
|
||||
}
|
||||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_srli_epi16() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i16x8::new(
|
||||
0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0);
|
||||
0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
|
||||
);
|
||||
let r = sse2::_mm_srli_epi16(a, 4);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i16x8::new(
|
||||
0xFFF as u16 as i16,
|
||||
0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0);
|
||||
0xFFF as u16 as i16, 0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -2747,13 +2830,18 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_cmpeq_epi8() {
|
||||
let a = i8x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let b = i8x16::new(
|
||||
15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
let a =
|
||||
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let b =
|
||||
i8x16::new(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
let r = sse2::_mm_cmpeq_epi8(a, b);
|
||||
assert_eq!(r, i8x16::new(
|
||||
0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
|
||||
assert_eq!(
|
||||
r,
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
i8x16::new(
|
||||
0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "sse2"]
|
||||
|
|
@ -2902,18 +2990,12 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_set_epi8() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let r = sse2::_mm_set_epi8(
|
||||
0, 1, 2, 3,
|
||||
4, 5, 6, 7,
|
||||
8, 9, 10, 11,
|
||||
12, 13, 14, 15,
|
||||
);
|
||||
let e = i8x16::new(
|
||||
15, 14, 13, 12,
|
||||
11, 10, 9, 8,
|
||||
7, 6, 5, 4,
|
||||
3, 2, 1, 0,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
);
|
||||
let e =
|
||||
i8x16::new(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -2955,18 +3037,12 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_setr_epi8() {
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let r = sse2::_mm_setr_epi8(
|
||||
0, 1, 2, 3,
|
||||
4, 5, 6, 7,
|
||||
8, 9, 10, 11,
|
||||
12, 13, 14, 15,
|
||||
);
|
||||
let e = i8x16::new(
|
||||
0, 1, 2, 3,
|
||||
4, 5, 6, 7,
|
||||
8, 9, 10, 11,
|
||||
12, 13, 14, 15,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
);
|
||||
let e =
|
||||
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -3042,9 +3118,13 @@ mod tests {
|
|||
let a = i16x8::new(0x80, -0x81, 0, 0, 0, 0, 0, 0);
|
||||
let b = i16x8::new(0, 0, 0, 0, 0, 0, -0x81, 0x80);
|
||||
let r = sse2::_mm_packs_epi16(a, b);
|
||||
assert_eq!(r, i8x16::new(
|
||||
0x7F, -0x80, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, -0x80, 0x7F));
|
||||
assert_eq!(
|
||||
r,
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
i8x16::new(
|
||||
0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "sse2"]
|
||||
|
|
@ -3053,7 +3133,9 @@ mod tests {
|
|||
let b = i32x4::new(0, 0, -0x8001, 0x8000);
|
||||
let r = sse2::_mm_packs_epi32(a, b);
|
||||
assert_eq!(
|
||||
r, i16x8::new(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF));
|
||||
r,
|
||||
i16x8::new(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "sse2"]
|
||||
|
|
@ -3061,9 +3143,10 @@ mod tests {
|
|||
let a = i16x8::new(0x100, -1, 0, 0, 0, 0, 0, 0);
|
||||
let b = i16x8::new(0, 0, 0, 0, 0, 0, -1, 0x100);
|
||||
let r = sse2::_mm_packus_epi16(a, b);
|
||||
assert_eq!(r, u8x16::new(
|
||||
0xFF, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0xFF));
|
||||
assert_eq!(
|
||||
r,
|
||||
u8x16::new(0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "sse2"]
|
||||
|
|
@ -3082,9 +3165,9 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_movemask_epi8() {
|
||||
let a = i8x16::from(u8x16::new(
|
||||
let a = i8x16::from(#[cfg_attr(rustfmt, rustfmt_skip)] u8x16::new(
|
||||
0b1000_0000, 0b0, 0b1000_0000, 0b01, 0b0101, 0b1111_0000, 0, 0,
|
||||
0, 0, 0b1111_0000, 0b0101, 0b01, 0b1000_0000, 0b0, 0b1000_0000));
|
||||
0, 0, 0b1111_0000, 0b0101, 0b01, 0b1000_0000, 0b0, 0b1000_0000, ));
|
||||
let r = sse2::_mm_movemask_epi8(a);
|
||||
assert_eq!(r, 0b10100100_00100101);
|
||||
}
|
||||
|
|
@ -3115,13 +3198,17 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_unpackhi_epi8() {
|
||||
let a = i8x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let a =
|
||||
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = i8x16::new(
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
);
|
||||
let r = sse2::_mm_unpackhi_epi8(a, b);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x16::new(
|
||||
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
|
||||
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -3154,13 +3241,15 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_unpacklo_epi8() {
|
||||
let a = i8x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let a =
|
||||
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = i8x16::new(
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
);
|
||||
let r = sse2::_mm_unpacklo_epi8(a, b);
|
||||
let e = i8x16::new(
|
||||
0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
|
||||
let e =
|
||||
i8x16::new(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
@ -3825,7 +3914,7 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_cvtpd_ps() {
|
||||
use std::{f64,f32};
|
||||
use std::{f32, f64};
|
||||
|
||||
let r = sse2::_mm_cvtpd_ps(f64x2::new(-1.0, 5.0));
|
||||
assert_eq!(r, f32x4::new(-1.0, 5.0, 0.0, 0.0));
|
||||
|
|
@ -3834,20 +3923,23 @@ mod tests {
|
|||
assert_eq!(r, f32x4::new(-1.0, -5.0, 0.0, 0.0));
|
||||
|
||||
let r = sse2::_mm_cvtpd_ps(f64x2::new(f64::MAX, f64::MIN));
|
||||
assert_eq!(r, f32x4::new(f32::INFINITY, f32::NEG_INFINITY, 0.0,0.0));
|
||||
assert_eq!(r, f32x4::new(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));
|
||||
|
||||
let r = sse2::_mm_cvtpd_ps(f64x2::new(f32::MAX as f64, f32::MIN as f64));
|
||||
assert_eq!(r, f32x4::new(f32::MAX, f32::MIN, 0.0,0.0));
|
||||
let r =
|
||||
sse2::_mm_cvtpd_ps(f64x2::new(f32::MAX as f64, f32::MIN as f64));
|
||||
assert_eq!(r, f32x4::new(f32::MAX, f32::MIN, 0.0, 0.0));
|
||||
}
|
||||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_cvtps_pd() {
|
||||
use std::{f64, f32};
|
||||
use std::{f32, f64};
|
||||
|
||||
let r = sse2::_mm_cvtps_pd(f32x4::new(-1.0, 2.0, -3.0, 5.0));
|
||||
assert_eq!(r, f64x2::new(-1.0, 2.0));
|
||||
|
||||
let r = sse2::_mm_cvtps_pd(f32x4::new(f32::MAX, f32::INFINITY, f32::NEG_INFINITY, f32::MIN));
|
||||
let r = sse2::_mm_cvtps_pd(
|
||||
f32x4::new(f32::MAX, f32::INFINITY, f32::NEG_INFINITY, f32::MIN),
|
||||
);
|
||||
assert_eq!(r, f64x2::new(f32::MAX as f64, f64::INFINITY));
|
||||
}
|
||||
|
||||
|
|
@ -3864,7 +3956,9 @@ mod tests {
|
|||
let r = sse2::_mm_cvtpd_epi32(f64x2::new(f64::MAX, f64::MIN));
|
||||
assert_eq!(r, i32x4::new(i32::MIN, i32::MIN, 0, 0));
|
||||
|
||||
let r = sse2::_mm_cvtpd_epi32(f64x2::new(f64::INFINITY, f64::NEG_INFINITY));
|
||||
let r = sse2::_mm_cvtpd_epi32(
|
||||
f64x2::new(f64::INFINITY, f64::NEG_INFINITY),
|
||||
);
|
||||
assert_eq!(r, i32x4::new(i32::MIN, i32::MIN, 0, 0));
|
||||
|
||||
let r = sse2::_mm_cvtpd_epi32(f64x2::new(f64::NAN, f64::NAN));
|
||||
|
|
@ -3902,7 +3996,7 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_cvtsd_ss() {
|
||||
use std::{f64, f32};
|
||||
use std::{f32, f64};
|
||||
|
||||
let a = f32x4::new(-1.1, -2.2, 3.3, 4.4);
|
||||
let b = f64x2::new(2.0, -5.0);
|
||||
|
|
@ -3911,17 +4005,26 @@ mod tests {
|
|||
|
||||
assert_eq!(r, f32x4::new(2.0, -2.2, 3.3, 4.4));
|
||||
|
||||
let a = f32x4::new(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
|
||||
let a =
|
||||
f32x4::new(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
|
||||
let b = f64x2::new(f64::INFINITY, -5.0);
|
||||
|
||||
let r = sse2::_mm_cvtsd_ss(a, b);
|
||||
|
||||
assert_eq!(r, f32x4::new(f32::INFINITY, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY));
|
||||
assert_eq!(
|
||||
r,
|
||||
f32x4::new(
|
||||
f32::INFINITY,
|
||||
f32::NEG_INFINITY,
|
||||
f32::MAX,
|
||||
f32::NEG_INFINITY
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "sse2"]
|
||||
unsafe fn _mm_cvtss_sd() {
|
||||
use std::{f64, f32};
|
||||
use std::{f32, f64};
|
||||
|
||||
let a = f64x2::new(-1.1, 2.2);
|
||||
let b = f32x4::new(1.0, 2.0, 3.0, 4.0);
|
||||
|
|
@ -3984,7 +4087,8 @@ mod tests {
|
|||
let r = sse2::_mm_cvttps_epi32(a);
|
||||
assert_eq!(r, i32x4::new(-1, 2, -3, 6));
|
||||
|
||||
let a = f32x4::new(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
|
||||
let a =
|
||||
f32x4::new(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
|
||||
let r = sse2::_mm_cvttps_epi32(a);
|
||||
assert_eq!(r, i32x4::new(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -106,7 +106,7 @@ pub unsafe fn _mm_moveldup_ps(a: f32x4) -> f32x4 {
|
|||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
extern {
|
||||
extern "C" {
|
||||
#[link_name = "llvm.x86.sse3.addsub.ps"]
|
||||
fn addsubps(a: f32x4, b: f32x4) -> f32x4;
|
||||
#[link_name = "llvm.x86.sse3.addsub.pd"]
|
||||
|
|
@ -129,7 +129,7 @@ mod tests {
|
|||
use stdsimd_test::simd_test;
|
||||
|
||||
use v128::*;
|
||||
use x86::sse3 as sse3;
|
||||
use x86::sse3;
|
||||
|
||||
#[simd_test = "sse3"]
|
||||
unsafe fn _mm_addsub_ps() {
|
||||
|
|
@ -181,7 +181,8 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse3"]
|
||||
unsafe fn _mm_lddqu_si128() {
|
||||
let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let a =
|
||||
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let r = sse3::_mm_lddqu_si128(&a);
|
||||
assert_eq!(a, r);
|
||||
}
|
||||
|
|
@ -213,4 +214,4 @@ mod tests {
|
|||
let r = sse3::_mm_loaddup_pd(&d);
|
||||
assert_eq!(r, f64x2::new(d, d));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ pub unsafe fn _mm_blendv_epi8(a: i8x16, b: i8x16, mask: i8x16) -> i8x16 {
|
|||
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(pblendw, imm8=0xF0))]
|
||||
#[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))]
|
||||
pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: u8) -> i16x8 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pblendw(a, b, $imm8) }
|
||||
|
|
@ -23,7 +23,8 @@ pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: u8) -> i16x8 {
|
|||
constify_imm8!(imm8, call)
|
||||
}
|
||||
|
||||
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`
|
||||
/// Blend packed double-precision (64-bit) floating-point elements from `a`
|
||||
/// and `b` using `mask`
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(blendvpd))]
|
||||
|
|
@ -31,7 +32,8 @@ pub unsafe fn _mm_blendv_pd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2 {
|
|||
blendvpd(a, b, mask)
|
||||
}
|
||||
|
||||
/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`
|
||||
/// Blend packed single-precision (32-bit) floating-point elements from `a`
|
||||
/// and `b` using `mask`
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(blendvps))]
|
||||
|
|
@ -39,10 +41,11 @@ pub unsafe fn _mm_blendv_ps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4 {
|
|||
blendvps(a, b, mask)
|
||||
}
|
||||
|
||||
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm2`
|
||||
/// Blend packed double-precision (64-bit) floating-point elements from `a`
|
||||
/// and `b` using control mask `imm2`
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(blendpd, imm2=0b10))]
|
||||
#[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))]
|
||||
pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: u8) -> f64x2 {
|
||||
macro_rules! call {
|
||||
($imm2:expr) => { blendpd(a, b, $imm2) }
|
||||
|
|
@ -50,10 +53,11 @@ pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: u8) -> f64x2 {
|
|||
constify_imm2!(imm2, call)
|
||||
}
|
||||
|
||||
/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using mask `imm4`
|
||||
/// Blend packed single-precision (32-bit) floating-point elements from `a`
|
||||
/// and `b` using mask `imm4`
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(blendps, imm4=0b0101))]
|
||||
#[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))]
|
||||
pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
|
||||
macro_rules! call {
|
||||
($imm4:expr) => { blendps(a, b, $imm4) }
|
||||
|
|
@ -61,11 +65,12 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
|
|||
constify_imm4!(imm4, call)
|
||||
}
|
||||
|
||||
/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`
|
||||
/// Extract a single-precision (32-bit) floating-point element from `a`,
|
||||
/// selected with `imm8`
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
// TODO: Add test for Windows
|
||||
#[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8=0))]
|
||||
#[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8 = 0))]
|
||||
pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
|
||||
mem::transmute(a.extract(imm8 as u32 & 0b11))
|
||||
}
|
||||
|
|
@ -73,7 +78,7 @@ pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
|
|||
/// Extract an 8-bit integer from `a` selected with `imm8`
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(pextrb, imm8=0))]
|
||||
#[cfg_attr(test, assert_instr(pextrb, imm8 = 0))]
|
||||
pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 {
|
||||
a.extract((imm8 & 0b1111) as u32)
|
||||
}
|
||||
|
|
@ -82,7 +87,7 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
// TODO: Add test for Windows
|
||||
#[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8=1))]
|
||||
#[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8 = 1))]
|
||||
pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 {
|
||||
a.extract((imm8 & 0b11) as u32)
|
||||
}
|
||||
|
|
@ -92,15 +97,16 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 {
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
// TODO: Add test for Windows
|
||||
#[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8=1))]
|
||||
#[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8 = 1))]
|
||||
pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 {
|
||||
a.extract((imm8 & 0b1) as u32)
|
||||
}
|
||||
|
||||
/// Select a single value in `a` to store at some position in `b`,
|
||||
/// Select a single value in `a` to store at some position in `b`,
|
||||
/// Then zero elements according to `imm8`.
|
||||
///
|
||||
/// `imm8` specifies which bits from operand `a` will be copied, which bits in the
|
||||
///
|
||||
/// `imm8` specifies which bits from operand `a` will be copied, which bits in
|
||||
/// the
|
||||
/// result they will be copied to, and which bits in the result will be
|
||||
/// cleared. The following assignments are made:
|
||||
///
|
||||
|
|
@ -121,7 +127,7 @@ pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 {
|
|||
/// element is cleared.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(insertps, imm8=0b1010))]
|
||||
#[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))]
|
||||
pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { insertps(a, b, $imm8) }
|
||||
|
|
@ -129,59 +135,66 @@ pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
|
|||
constify_imm8!(imm8, call)
|
||||
}
|
||||
|
||||
/// Return a copy of `a` with the 8-bit integer from `i` inserted at a location specified by `imm8`.
|
||||
/// Return a copy of `a` with the 8-bit integer from `i` inserted at a
|
||||
/// location specified by `imm8`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(pinsrb, imm8=0))]
|
||||
#[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))]
|
||||
pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: u8) -> i8x16 {
|
||||
a.replace((imm8 & 0b1111) as u32, i)
|
||||
}
|
||||
|
||||
/// Return a copy of `a` with the 32-bit integer from `i` inserted at a location specified by `imm8`.
|
||||
/// Return a copy of `a` with the 32-bit integer from `i` inserted at a
|
||||
/// location specified by `imm8`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(pinsrd, imm8=0))]
|
||||
#[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))]
|
||||
pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: u8) -> i32x4 {
|
||||
a.replace((imm8 & 0b11) as u32, i)
|
||||
}
|
||||
|
||||
/// Return a copy of `a` with the 64-bit integer from `i` inserted at a location specified by `imm8`.
|
||||
/// Return a copy of `a` with the 64-bit integer from `i` inserted at a
|
||||
/// location specified by `imm8`.
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(pinsrq, imm8=0))]
|
||||
#[cfg_attr(test, assert_instr(pinsrq, imm8 = 0))]
|
||||
pub unsafe fn _mm_insert_epi64(a: i64x2, i: i64, imm8: u8) -> i64x2 {
|
||||
a.replace((imm8 & 0b1) as u32, i)
|
||||
}
|
||||
|
||||
/// Compare packed 8-bit integers in `a` and `b`,87 and return packed maximum values in dst.
|
||||
/// Compare packed 8-bit integers in `a` and `b`,87 and return packed maximum
|
||||
/// values in dst.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(pmaxsb, imm8=0))]
|
||||
#[cfg_attr(test, assert_instr(pmaxsb, imm8 = 0))]
|
||||
pub unsafe fn _mm_max_epi8(a: i8x16, b: i8x16) -> i8x16 {
|
||||
pmaxsb(a, b)
|
||||
}
|
||||
|
||||
/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed maximum.
|
||||
/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed
|
||||
/// maximum.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(pmaxuw, imm8=0))]
|
||||
#[cfg_attr(test, assert_instr(pmaxuw, imm8 = 0))]
|
||||
pub unsafe fn _mm_max_epu16(a: u16x8, b: u16x8) -> u16x8 {
|
||||
pmaxuw(a, b)
|
||||
}
|
||||
|
||||
// Compare packed 32-bit integers in `a` and `b`, and return packed maximum values.
|
||||
// Compare packed 32-bit integers in `a` and `b`, and return packed maximum
|
||||
// values.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(pmaxsd, imm8=0))]
|
||||
#[cfg_attr(test, assert_instr(pmaxsd, imm8 = 0))]
|
||||
pub unsafe fn _mm_max_epi32(a: i32x4, b: i32x4) -> i32x4 {
|
||||
pmaxsd(a, b)
|
||||
}
|
||||
|
||||
// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values.
|
||||
// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed
|
||||
// maximum values.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse4.1"]
|
||||
#[cfg_attr(test, assert_instr(pmaxud, imm8=0))]
|
||||
#[cfg_attr(test, assert_instr(pmaxud, imm8 = 0))]
|
||||
pub unsafe fn _mm_max_epu32(a: u32x4, b: u32x4) -> u32x4 {
|
||||
pmaxud(a, b)
|
||||
}
|
||||
|
|
@ -221,7 +234,7 @@ pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
|
|||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
extern {
|
||||
extern "C" {
|
||||
#[link_name = "llvm.x86.sse41.pblendvb"]
|
||||
fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
|
||||
#[link_name = "llvm.x86.sse41.blendvpd"]
|
||||
|
|
@ -261,14 +274,18 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse4.1"]
|
||||
unsafe fn _mm_blendv_epi8() {
|
||||
let a = i8x16::new(
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let a =
|
||||
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = i8x16::new(
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
|
||||
let mask = i8x16::new(
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
);
|
||||
let mask =
|
||||
i8x16::new(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x16::new(
|
||||
0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31);
|
||||
0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
|
||||
);
|
||||
assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e);
|
||||
}
|
||||
|
||||
|
|
@ -286,7 +303,7 @@ mod tests {
|
|||
unsafe fn _mm_blendv_ps() {
|
||||
let a = f32x4::splat(0.0);
|
||||
let b = f32x4::splat(1.0);
|
||||
let mask = mem::transmute(i32x4::new(0,-1, 0, -1));
|
||||
let mask = mem::transmute(i32x4::new(0, -1, 0, -1));
|
||||
let r = sse41::_mm_blendv_ps(a, b, mask);
|
||||
let e = f32x4::new(0.0, 1.0, 0.0, 1.0);
|
||||
assert_eq!(r, e);
|
||||
|
|
@ -330,7 +347,8 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse4.1"]
|
||||
unsafe fn _mm_extract_epi8() {
|
||||
let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let a =
|
||||
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let r = sse41::_mm_extract_epi8(a, 1);
|
||||
assert_eq!(r, 1);
|
||||
let r = sse41::_mm_extract_epi8(a, 17);
|
||||
|
|
@ -398,10 +416,22 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse4.1"]
|
||||
unsafe fn _mm_max_epi8() {
|
||||
let a = i8x16::new(1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32);
|
||||
let b = i8x16::new(2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i8x16::new(
|
||||
1, 4, 5, 8, 9, 12, 13, 16,
|
||||
17, 20, 21, 24, 25, 28, 29, 32,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = i8x16::new(
|
||||
2, 3, 6, 7, 10, 11, 14, 15,
|
||||
18, 19, 22, 23, 26, 27, 30, 31,
|
||||
);
|
||||
let r = sse41::_mm_max_epi8(a, b);
|
||||
let e = i8x16::new(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let e = i8x16::new(
|
||||
2, 4, 6, 8, 10, 12, 14, 16,
|
||||
18, 20, 22, 24, 26, 28, 30, 32,
|
||||
);
|
||||
assert_eq!(r, e);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -15,7 +15,8 @@ pub const _SIDD_SWORD_OPS: i8 = 0b00000011;
|
|||
|
||||
/// For each character in `a`, find if it is in `b` *(Default)*
|
||||
pub const _SIDD_CMP_EQUAL_ANY: i8 = 0b00000000;
|
||||
/// For each character in `a`, determine if `b[0] <= c <= b[1] or b[1] <= c <= b[2]...`
|
||||
/// For each character in `a`, determine if `b[0] <= c <= b[1] or b[1] <= c <=
|
||||
/// b[2]...`
|
||||
pub const _SIDD_CMP_RANGES: i8 = 0b00000100;
|
||||
/// The strings defined by `a` and `b` are equal
|
||||
pub const _SIDD_CMP_EQUAL_EACH: i8 = 0b00001000;
|
||||
|
|
@ -46,11 +47,7 @@ pub const _SIDD_UNIT_MASK: i8 = 0b01000000;
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpistrm, imm8 = 0))]
|
||||
pub unsafe fn _mm_cmpistrm(
|
||||
a: __m128i,
|
||||
b: __m128i,
|
||||
imm8: i8,
|
||||
) -> u8x16 {
|
||||
pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i8) -> u8x16 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pcmpistrm128(a, b, $imm8) }
|
||||
}
|
||||
|
|
@ -58,9 +55,9 @@ pub unsafe fn _mm_cmpistrm(
|
|||
}
|
||||
|
||||
/// Compare packed strings with implicit lengths in `a` and `b` using the
|
||||
/// control in `imm8`, and return the generated index. Similar to [`_mm_cmpestri`]
|
||||
/// with the excception that [`_mm_cmpestri`] requires the lengths of `a` and
|
||||
/// `b` to be explicitly specified.
|
||||
/// control in `imm8`, and return the generated index. Similar to
|
||||
/// [`_mm_cmpestri`] with the excception that [`_mm_cmpestri`] requires the
|
||||
/// lengths of `a` and `b` to be explicitly specified.
|
||||
///
|
||||
/// # Control modes
|
||||
///
|
||||
|
|
@ -105,7 +102,8 @@ pub unsafe fn _mm_cmpistrm(
|
|||
/// use stdsimd::simd::u8x16;
|
||||
/// use stdsimd::vendor::{__m128i, _mm_cmpistri, _SIDD_CMP_EQUAL_ORDERED};
|
||||
///
|
||||
/// let haystack = b"This is a long string of text data\r\n\tthat extends multiple lines";
|
||||
/// let haystack = b"This is a long string of text data\r\n\tthat extends
|
||||
/// multiple lines";
|
||||
/// let needle = b"\r\n\t\0\0\0\0\0\0\0\0\0\0\0\0\0";
|
||||
///
|
||||
/// let a = __m128i::from(u8x16::load(needle, 0));
|
||||
|
|
@ -171,8 +169,8 @@ pub unsafe fn _mm_cmpistrm(
|
|||
/// # }
|
||||
/// ```
|
||||
///
|
||||
/// Find the index of the first character in the haystack that is within a range
|
||||
/// of characters.
|
||||
/// Find the index of the first character in the haystack that is within a
|
||||
/// range of characters.
|
||||
///
|
||||
/// ```
|
||||
/// # #![feature(cfg_target_feature)]
|
||||
|
|
@ -269,11 +267,7 @@ pub unsafe fn _mm_cmpistrm(
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
|
||||
pub unsafe fn _mm_cmpistri(
|
||||
a: __m128i,
|
||||
b: __m128i,
|
||||
imm8: i8,
|
||||
) -> i32 {
|
||||
pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i8) -> i32 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pcmpistri128(a, b, $imm8) }
|
||||
}
|
||||
|
|
@ -286,11 +280,7 @@ pub unsafe fn _mm_cmpistri(
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
|
||||
pub unsafe fn _mm_cmpistrz(
|
||||
a: __m128i,
|
||||
b: __m128i,
|
||||
imm8: i8,
|
||||
) -> i32 {
|
||||
pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i8) -> i32 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pcmpistriz128(a, b, $imm8) }
|
||||
}
|
||||
|
|
@ -303,11 +293,7 @@ pub unsafe fn _mm_cmpistrz(
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
|
||||
pub unsafe fn _mm_cmpistrc(
|
||||
a: __m128i,
|
||||
b: __m128i,
|
||||
imm8: i8,
|
||||
) -> i32 {
|
||||
pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i8) -> i32 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pcmpistric128(a, b, $imm8) }
|
||||
}
|
||||
|
|
@ -320,11 +306,7 @@ pub unsafe fn _mm_cmpistrc(
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
|
||||
pub unsafe fn _mm_cmpistrs(
|
||||
a: __m128i,
|
||||
b: __m128i,
|
||||
imm8: i8,
|
||||
) -> i32 {
|
||||
pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i8) -> i32 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pcmpistris128(a, b, $imm8) }
|
||||
}
|
||||
|
|
@ -336,11 +318,7 @@ pub unsafe fn _mm_cmpistrs(
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
|
||||
pub unsafe fn _mm_cmpistro(
|
||||
a: __m128i,
|
||||
b: __m128i,
|
||||
imm8: i8,
|
||||
) -> i32 {
|
||||
pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i8) -> i32 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pcmpistrio128(a, b, $imm8) }
|
||||
}
|
||||
|
|
@ -353,11 +331,7 @@ pub unsafe fn _mm_cmpistro(
|
|||
#[inline(always)]
|
||||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
|
||||
pub unsafe fn _mm_cmpistra(
|
||||
a: __m128i,
|
||||
b: __m128i,
|
||||
imm8: i8,
|
||||
) -> i32 {
|
||||
pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i8) -> i32 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pcmpistria128(a, b, $imm8) }
|
||||
}
|
||||
|
|
@ -370,11 +344,7 @@ pub unsafe fn _mm_cmpistra(
|
|||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpestrm, imm8 = 0))]
|
||||
pub unsafe fn _mm_cmpestrm(
|
||||
a: __m128i,
|
||||
la: i32,
|
||||
b: __m128i,
|
||||
lb: i32,
|
||||
imm8: i8,
|
||||
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
|
||||
) -> u8x16 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pcmpestrm128(a, la, b, lb, $imm8) }
|
||||
|
|
@ -383,9 +353,9 @@ pub unsafe fn _mm_cmpestrm(
|
|||
}
|
||||
|
||||
/// Compare packed strings `a` and `b` with lengths `la` and `lb` using the
|
||||
/// control in `imm8`, and return the generated index. Similar to [`_mm_cmpistri`]
|
||||
/// with the excception that [`_mm_cmpistri`] implicityly determines the length of
|
||||
/// `a` and `b`.
|
||||
/// control in `imm8`, and return the generated index. Similar to
|
||||
/// [`_mm_cmpistri`] with the excception that [`_mm_cmpistri`] implicityly
|
||||
/// determines the length of `a` and `b`.
|
||||
///
|
||||
/// # Control modes
|
||||
///
|
||||
|
|
@ -468,11 +438,7 @@ pub unsafe fn _mm_cmpestrm(
|
|||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
|
||||
pub unsafe fn _mm_cmpestri(
|
||||
a: __m128i,
|
||||
la: i32,
|
||||
b: __m128i,
|
||||
lb: i32,
|
||||
imm8: i8,
|
||||
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
|
||||
) -> i32 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pcmpestri128(a, la, b, lb, $imm8) }
|
||||
|
|
@ -487,11 +453,7 @@ pub unsafe fn _mm_cmpestri(
|
|||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
|
||||
pub unsafe fn _mm_cmpestrz(
|
||||
a: __m128i,
|
||||
la: i32,
|
||||
b: __m128i,
|
||||
lb: i32,
|
||||
imm8: i8,
|
||||
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
|
||||
) -> i32 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pcmpestriz128(a, la, b, lb, $imm8) }
|
||||
|
|
@ -506,11 +468,7 @@ pub unsafe fn _mm_cmpestrz(
|
|||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
|
||||
pub unsafe fn _mm_cmpestrc(
|
||||
a: __m128i,
|
||||
la: i32,
|
||||
b: __m128i,
|
||||
lb: i32,
|
||||
imm8: i8,
|
||||
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
|
||||
) -> i32 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pcmpestric128(a, la, b, lb, $imm8) }
|
||||
|
|
@ -525,11 +483,7 @@ pub unsafe fn _mm_cmpestrc(
|
|||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
|
||||
pub unsafe fn _mm_cmpestrs(
|
||||
a: __m128i,
|
||||
la: i32,
|
||||
b: __m128i,
|
||||
lb: i32,
|
||||
imm8: i8,
|
||||
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
|
||||
) -> i32 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pcmpestris128(a, la, b, lb, $imm8) }
|
||||
|
|
@ -544,11 +498,7 @@ pub unsafe fn _mm_cmpestrs(
|
|||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
|
||||
pub unsafe fn _mm_cmpestro(
|
||||
a: __m128i,
|
||||
la: i32,
|
||||
b: __m128i,
|
||||
lb: i32,
|
||||
imm8: i8,
|
||||
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
|
||||
) -> i32 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pcmpestrio128(a, la, b, lb, $imm8) }
|
||||
|
|
@ -564,11 +514,7 @@ pub unsafe fn _mm_cmpestro(
|
|||
#[target_feature = "+sse4.2"]
|
||||
#[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
|
||||
pub unsafe fn _mm_cmpestra(
|
||||
a: __m128i,
|
||||
la: i32,
|
||||
b: __m128i,
|
||||
lb: i32,
|
||||
imm8: i8,
|
||||
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
|
||||
) -> i32 {
|
||||
macro_rules! call {
|
||||
($imm8:expr) => { pcmpestria128(a, la, b, lb, $imm8) }
|
||||
|
|
@ -624,22 +570,35 @@ pub unsafe fn _mm_cmpgt_epi64(a: i64x2, b: i64x2) -> i64x2 {
|
|||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
extern {
|
||||
extern "C" {
|
||||
// SSE 4.2 string and text comparison ops
|
||||
#[link_name = "llvm.x86.sse42.pcmpestrm128"]
|
||||
fn pcmpestrm128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> u8x16;
|
||||
fn pcmpestrm128(
|
||||
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
|
||||
) -> u8x16;
|
||||
#[link_name = "llvm.x86.sse42.pcmpestri128"]
|
||||
fn pcmpestri128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32;
|
||||
fn pcmpestri128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8)
|
||||
-> i32;
|
||||
#[link_name = "llvm.x86.sse42.pcmpestriz128"]
|
||||
fn pcmpestriz128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32;
|
||||
fn pcmpestriz128(
|
||||
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
|
||||
) -> i32;
|
||||
#[link_name = "llvm.x86.sse42.pcmpestric128"]
|
||||
fn pcmpestric128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32;
|
||||
fn pcmpestric128(
|
||||
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
|
||||
) -> i32;
|
||||
#[link_name = "llvm.x86.sse42.pcmpestris128"]
|
||||
fn pcmpestris128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32;
|
||||
fn pcmpestris128(
|
||||
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
|
||||
) -> i32;
|
||||
#[link_name = "llvm.x86.sse42.pcmpestrio128"]
|
||||
fn pcmpestrio128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32;
|
||||
fn pcmpestrio128(
|
||||
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
|
||||
) -> i32;
|
||||
#[link_name = "llvm.x86.sse42.pcmpestria128"]
|
||||
fn pcmpestria128(a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8) -> i32;
|
||||
fn pcmpestria128(
|
||||
a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
|
||||
) -> i32;
|
||||
#[link_name = "llvm.x86.sse42.pcmpistrm128"]
|
||||
fn pcmpistrm128(a: __m128i, b: __m128i, imm8: i8) -> u8x16;
|
||||
#[link_name = "llvm.x86.sse42.pcmpistri128"]
|
||||
|
|
@ -685,7 +644,8 @@ mod tests {
|
|||
ptr::copy_nonoverlapping(
|
||||
s.get_unchecked(0) as *const u8 as *const u8,
|
||||
slice.get_unchecked_mut(0) as *mut u8 as *mut u8,
|
||||
s.len());
|
||||
s.len(),
|
||||
);
|
||||
__m128i::from(u8x16::load(slice, 0))
|
||||
}
|
||||
|
||||
|
|
@ -694,8 +654,11 @@ mod tests {
|
|||
let a = str_to_m128i(b"Hello! Good-Bye!");
|
||||
let b = str_to_m128i(b"hello! good-bye!");
|
||||
let i = sse42::_mm_cmpistrm(a, b, sse42::_SIDD_UNIT_MASK);
|
||||
let res = u8x16::new(0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let res = u8x16::new(
|
||||
0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff,
|
||||
);
|
||||
assert_eq!(i, res);
|
||||
}
|
||||
|
||||
|
|
@ -733,14 +696,23 @@ mod tests {
|
|||
|
||||
#[simd_test = "sse4.2"]
|
||||
unsafe fn _mm_cmpistro() {
|
||||
let a_bytes = u8x16::new(0x00, 0x47, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
|
||||
0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
|
||||
let b_bytes = u8x16::new(0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
|
||||
0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a_bytes = u8x16::new(
|
||||
0x00, 0x47, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
|
||||
0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b_bytes = u8x16::new(
|
||||
0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
|
||||
0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
);
|
||||
let a = __m128i::from(a_bytes);
|
||||
let b = __m128i::from(b_bytes);
|
||||
let i = sse42::_mm_cmpistro(
|
||||
a, b, sse42::_SIDD_UWORD_OPS | sse42::_SIDD_UNIT_MASK);
|
||||
a,
|
||||
b,
|
||||
sse42::_SIDD_UWORD_OPS | sse42::_SIDD_UNIT_MASK,
|
||||
);
|
||||
assert_eq!(0, i);
|
||||
}
|
||||
|
||||
|
|
@ -757,15 +729,20 @@ mod tests {
|
|||
let a = str_to_m128i(b"Hello!");
|
||||
let b = str_to_m128i(b"Hello.");
|
||||
let i = sse42::_mm_cmpestrm(a, 5, b, 5, sse42::_SIDD_UNIT_MASK);
|
||||
assert_eq!(i, u8x16::new(0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00));
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let r = u8x16::new(
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
||||
);
|
||||
assert_eq!(i, r);
|
||||
}
|
||||
|
||||
#[simd_test = "sse4.2"]
|
||||
unsafe fn _mm_cmpestri() {
|
||||
let a = str_to_m128i(b"bar - garbage");
|
||||
let b = str_to_m128i(b"foobar");
|
||||
let i = sse42::_mm_cmpestri(a, 3, b, 6, sse42::_SIDD_CMP_EQUAL_ORDERED);
|
||||
let i =
|
||||
sse42::_mm_cmpestri(a, 3, b, 6, sse42::_SIDD_CMP_EQUAL_ORDERED);
|
||||
assert_eq!(3, i);
|
||||
}
|
||||
|
||||
|
|
@ -773,8 +750,8 @@ mod tests {
|
|||
unsafe fn _mm_cmpestrz() {
|
||||
let a = str_to_m128i(b"");
|
||||
let b = str_to_m128i(b"Hello");
|
||||
let i = sse42::_mm_cmpestrz(
|
||||
a, 16, b, 6, sse42::_SIDD_CMP_EQUAL_ORDERED);
|
||||
let i =
|
||||
sse42::_mm_cmpestrz(a, 16, b, 6, sse42::_SIDD_CMP_EQUAL_ORDERED);
|
||||
assert_eq!(1, i);
|
||||
}
|
||||
|
||||
|
|
@ -782,19 +759,20 @@ mod tests {
|
|||
unsafe fn _mm_cmpestrc() {
|
||||
let va = str_to_m128i(b"!!!!!!!!");
|
||||
let vb = str_to_m128i(b" ");
|
||||
let i = sse42::_mm_cmpestrc(
|
||||
va, 7, vb, 7, sse42::_SIDD_UNIT_MASK);
|
||||
let i = sse42::_mm_cmpestrc(va, 7, vb, 7, sse42::_SIDD_UNIT_MASK);
|
||||
assert_eq!(0, i);
|
||||
}
|
||||
|
||||
#[simd_test = "sse4.2"]
|
||||
unsafe fn _mm_cmpestrs() {
|
||||
let a_bytes = u8x16::new(0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
|
||||
0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a_bytes = u8x16::new(
|
||||
0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c,
|
||||
0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
);
|
||||
let a = __m128i::from(a_bytes);
|
||||
let b = __m128i::from(u8x16::splat(0x00));
|
||||
let i = sse42::_mm_cmpestrs(
|
||||
a, 8, b, 0, sse42::_SIDD_UWORD_OPS);
|
||||
let i = sse42::_mm_cmpestrs(a, 8, b, 0, sse42::_SIDD_UWORD_OPS);
|
||||
assert_eq!(0, i);
|
||||
}
|
||||
|
||||
|
|
@ -802,8 +780,7 @@ mod tests {
|
|||
unsafe fn _mm_cmpestro() {
|
||||
let a = str_to_m128i(b"Hello");
|
||||
let b = str_to_m128i(b"World");
|
||||
let i = sse42::_mm_cmpestro(
|
||||
a, 5, b, 5, sse42::_SIDD_UBYTE_OPS);
|
||||
let i = sse42::_mm_cmpestro(a, 5, b, 5, sse42::_SIDD_UBYTE_OPS);
|
||||
assert_eq!(0, i);
|
||||
}
|
||||
|
||||
|
|
@ -812,7 +789,12 @@ mod tests {
|
|||
let a = str_to_m128i(b"Cannot match a");
|
||||
let b = str_to_m128i(b"Null after 14");
|
||||
let i = sse42::_mm_cmpestra(
|
||||
a, 14, b, 16, sse42::_SIDD_CMP_EQUAL_EACH | sse42::_SIDD_UNIT_MASK);
|
||||
a,
|
||||
14,
|
||||
b,
|
||||
16,
|
||||
sse42::_SIDD_CMP_EQUAL_EACH | sse42::_SIDD_UNIT_MASK,
|
||||
);
|
||||
assert_eq!(1, i);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -13,7 +13,8 @@ pub unsafe fn _mm_abs_epi8(a: i8x16) -> u8x16 {
|
|||
pabsb128(a)
|
||||
}
|
||||
|
||||
/// Compute the absolute value of each of the packed 16-bit signed integers in `a` and
|
||||
/// Compute the absolute value of each of the packed 16-bit signed integers in
|
||||
/// `a` and
|
||||
/// return the 16-bit unsigned integer
|
||||
#[inline(always)]
|
||||
#[target_feature = "+ssse3"]
|
||||
|
|
@ -22,7 +23,8 @@ pub unsafe fn _mm_abs_epi16(a: i16x8) -> u16x8 {
|
|||
pabsw128(a)
|
||||
}
|
||||
|
||||
/// Compute the absolute value of each of the packed 32-bit signed integers in `a` and
|
||||
/// Compute the absolute value of each of the packed 32-bit signed integers in
|
||||
/// `a` and
|
||||
/// return the 32-bit unsigned integer
|
||||
#[inline(always)]
|
||||
#[target_feature = "+ssse3"]
|
||||
|
|
@ -82,7 +84,9 @@ pub unsafe fn _mm_alignr_epi8(a: i8x16, b: i8x16, n: i32) -> i8x16 {
|
|||
(a, b, n)
|
||||
};
|
||||
|
||||
const fn add(a: u32, b: u32) -> u32 { a + b }
|
||||
const fn add(a: u32, b: u32) -> u32 {
|
||||
a + b
|
||||
}
|
||||
macro_rules! shuffle {
|
||||
($shift:expr) => {
|
||||
simd_shuffle16(b, a, [
|
||||
|
|
@ -98,14 +102,22 @@ pub unsafe fn _mm_alignr_epi8(a: i8x16, b: i8x16, n: i32) -> i8x16 {
|
|||
}
|
||||
}
|
||||
match n {
|
||||
0 => shuffle!(0), 1 => shuffle!(1),
|
||||
2 => shuffle!(2), 3 => shuffle!(3),
|
||||
4 => shuffle!(4), 5 => shuffle!(5),
|
||||
6 => shuffle!(6), 7 => shuffle!(7),
|
||||
8 => shuffle!(8), 9 => shuffle!(9),
|
||||
10 => shuffle!(10), 11 => shuffle!(11),
|
||||
12 => shuffle!(12), 13 => shuffle!(13),
|
||||
14 => shuffle!(14), 15 => shuffle!(15),
|
||||
0 => shuffle!(0),
|
||||
1 => shuffle!(1),
|
||||
2 => shuffle!(2),
|
||||
3 => shuffle!(3),
|
||||
4 => shuffle!(4),
|
||||
5 => shuffle!(5),
|
||||
6 => shuffle!(6),
|
||||
7 => shuffle!(7),
|
||||
8 => shuffle!(8),
|
||||
9 => shuffle!(9),
|
||||
10 => shuffle!(10),
|
||||
11 => shuffle!(11),
|
||||
12 => shuffle!(12),
|
||||
13 => shuffle!(13),
|
||||
14 => shuffle!(14),
|
||||
15 => shuffle!(15),
|
||||
_ => shuffle!(16),
|
||||
}
|
||||
}
|
||||
|
|
@ -223,7 +235,7 @@ pub unsafe fn _mm_sign_epi32(a: i32x4, b: i32x4) -> i32x4 {
|
|||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
extern {
|
||||
extern "C" {
|
||||
#[link_name = "llvm.x86.ssse3.pabs.b.128"]
|
||||
fn pabsb128(a: i8x16) -> u8x16;
|
||||
|
||||
|
|
@ -275,7 +287,7 @@ mod tests {
|
|||
use stdsimd_test::simd_test;
|
||||
|
||||
use v128::*;
|
||||
use x86::ssse3 as ssse3;
|
||||
use x86::ssse3;
|
||||
|
||||
#[simd_test = "ssse3"]
|
||||
unsafe fn _mm_abs_epi8() {
|
||||
|
|
@ -297,44 +309,36 @@ mod tests {
|
|||
|
||||
#[simd_test = "ssse3"]
|
||||
unsafe fn _mm_shuffle_epi8() {
|
||||
let a = u8x16::new(
|
||||
1, 2, 3, 4,
|
||||
5, 6, 7, 8,
|
||||
9, 10, 11, 12,
|
||||
13, 14, 15, 16,
|
||||
);
|
||||
let b = u8x16::new(
|
||||
4, 128, 4, 3,
|
||||
24, 12, 6, 19,
|
||||
12, 5, 5, 10,
|
||||
4, 1, 8, 0,
|
||||
);
|
||||
let expected = u8x16::new(
|
||||
5, 0, 5, 4,
|
||||
9, 13, 7, 4,
|
||||
13, 6, 6, 11,
|
||||
5, 2, 9, 1,
|
||||
);
|
||||
let a =
|
||||
u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let b =
|
||||
u8x16::new(4, 128, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
|
||||
let expected =
|
||||
u8x16::new(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
|
||||
let r = ssse3::_mm_shuffle_epi8(a, b);
|
||||
assert_eq!(r, expected);
|
||||
}
|
||||
|
||||
#[simd_test = "ssse3"]
|
||||
unsafe fn _mm_alignr_epi8() {
|
||||
let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let b = i8x16::new(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
|
||||
let a =
|
||||
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let b =
|
||||
i8x16::new(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
|
||||
let r = ssse3::_mm_alignr_epi8(a, b, 33);
|
||||
assert_eq!(r, i8x16::splat(0));
|
||||
|
||||
let r = ssse3::_mm_alignr_epi8(a, b, 17);
|
||||
let expected = i8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0);
|
||||
let expected =
|
||||
i8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0);
|
||||
assert_eq!(r, expected);
|
||||
|
||||
let r = ssse3::_mm_alignr_epi8(a, b, 16);
|
||||
assert_eq!(r, a);
|
||||
|
||||
let r = ssse3::_mm_alignr_epi8(a, b, 15);
|
||||
let expected = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
let expected =
|
||||
i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
assert_eq!(r, expected);
|
||||
|
||||
let r = ssse3::_mm_alignr_epi8(a, b, 0);
|
||||
|
|
@ -397,8 +401,10 @@ mod tests {
|
|||
|
||||
#[simd_test = "ssse3"]
|
||||
unsafe fn _mm_maddubs_epi16() {
|
||||
let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let b = i8x16::new(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
|
||||
let a =
|
||||
u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
||||
let b =
|
||||
i8x16::new(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
|
||||
let expected = i16x8::new(130, 24, 192, 194, 158, 175, 66, 120);
|
||||
let r = ssse3::_mm_maddubs_epi16(a, b);
|
||||
assert_eq!(r, expected);
|
||||
|
|
@ -415,9 +421,21 @@ mod tests {
|
|||
|
||||
#[simd_test = "ssse3"]
|
||||
unsafe fn _mm_sign_epi8() {
|
||||
let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -14, -15, 16);
|
||||
let b = i8x16::new(4, 63, -4, 3, 24, 12, -6, -19, 12, 5, -5, 10, 4, 1, -8, 0);
|
||||
let expected = i8x16::new(1, 2, -3, 4, 5, 6, -7, -8, 9, 10, -11, 12, 13, -14, 15, 0);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let a = i8x16::new(
|
||||
1, 2, 3, 4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 13, -14, -15, 16,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let b = i8x16::new(
|
||||
4, 63, -4, 3, 24, 12, -6, -19,
|
||||
12, 5, -5, 10, 4, 1, -8, 0,
|
||||
);
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
let expected = i8x16::new(
|
||||
1, 2, -3, 4, 5, 6, -7, -8,
|
||||
9, 10, -11, 12, 13, -14, 15, 0,
|
||||
);
|
||||
let r = ssse3::_mm_sign_epi8(a, b);
|
||||
assert_eq!(r, expected);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,16 +1,21 @@
|
|||
//! Trailing Bit Manipulation (TBM) instruction set.
|
||||
//!
|
||||
//! The reference is [AMD64 Architecture Programmer's Manual, Volume 3:
|
||||
//! General-Purpose and System
|
||||
//! Instructions](http://support.amd.com/TechDocs/24594.pdf).
|
||||
//! General-Purpose and System Instructions][amd64_ref].
|
||||
//!
|
||||
//! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#TBM_.28Trailing_Bit_Manipulation.29)
|
||||
//! provides a quick overview of the available instructions.
|
||||
//! [Wikipedia][wikipedia_bmi] provides a quick overview of the available
|
||||
//! instructions.
|
||||
//!
|
||||
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
|
||||
//! [wikipedia_bmi]:
|
||||
//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.
|
||||
//! 28Advanced_Bit_Manipulation.29
|
||||
|
||||
#[cfg(test)]
|
||||
use stdsimd_test::assert_instr;
|
||||
|
||||
// TODO: LLVM-CODEGEN ERROR: LLVM ERROR: Cannot select: intrinsic %llvm.x86.tbm.bextri.u32
|
||||
// TODO: LLVM-CODEGEN ERROR: LLVM ERROR: Cannot select:
|
||||
// intrinsic %llvm.x86.tbm.bextri.u32
|
||||
/*
|
||||
#[allow(dead_code)]
|
||||
extern "C" {
|
||||
|
|
@ -39,8 +44,8 @@ pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
|
|||
/// Extracts bits of `a` specified by `control` into
|
||||
/// the least significant bits of the result.
|
||||
///
|
||||
/// Bits [7,0] of `control` specify the index to the first bit in the range to be
|
||||
/// extracted, and bits [15,8] specify the length of the range.
|
||||
/// Bits [7,0] of `control` specify the index to the first bit in the range to
|
||||
/// be extracted, and bits [15,8] specify the length of the range.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+tbm"]
|
||||
pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
|
||||
|
|
@ -50,8 +55,8 @@ pub fn _bextr2_u32(a: u32, control: u32) -> u32 {
|
|||
/// Extracts bits of `a` specified by `control` into
|
||||
/// the least significant bits of the result.
|
||||
///
|
||||
/// Bits [7,0] of `control` specify the index to the first bit in the range to be
|
||||
/// extracted, and bits [15,8] specify the length of the range.
|
||||
/// Bits [7,0] of `control` specify the index to the first bit in the range to
|
||||
/// be extracted, and bits [15,8] specify the length of the range.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+tbm"]
|
||||
pub fn _bextr2_u64(a: u64, control: u64) -> u64 {
|
||||
|
|
@ -122,7 +127,8 @@ pub unsafe fn _blcic_u64(x: u64) -> u64 {
|
|||
!x & (x.wrapping_add(1))
|
||||
}
|
||||
|
||||
/// Sets the least significant zero bit of `x` and clears all bits above that bit.
|
||||
/// Sets the least significant zero bit of `x` and clears all bits above
|
||||
/// that bit.
|
||||
///
|
||||
/// If there is no zero bit in `x`, it sets all the bits.
|
||||
#[inline(always)]
|
||||
|
|
@ -132,7 +138,8 @@ pub unsafe fn _blcmsk_u32(x: u32) -> u32 {
|
|||
x ^ (x.wrapping_add(1))
|
||||
}
|
||||
|
||||
/// Sets the least significant zero bit of `x` and clears all bits above that bit.
|
||||
/// Sets the least significant zero bit of `x` and clears all bits above
|
||||
/// that bit.
|
||||
///
|
||||
/// If there is no zero bit in `x`, it sets all the bits.
|
||||
#[inline(always)]
|
||||
|
|
@ -272,162 +279,152 @@ mod tests {
|
|||
|
||||
#[simd_test = "tbm"]
|
||||
unsafe fn _blcfill_u32() {
|
||||
assert_eq!(
|
||||
tbm::_blcfill_u32(0b0101_0111u32),
|
||||
0b0101_0000u32);
|
||||
assert_eq!(
|
||||
tbm::_blcfill_u32(0b1111_1111u32),
|
||||
0u32);
|
||||
assert_eq!(tbm::_blcfill_u32(0b0101_0111u32), 0b0101_0000u32);
|
||||
assert_eq!(tbm::_blcfill_u32(0b1111_1111u32), 0u32);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
unsafe fn _blcfill_u64() {
|
||||
assert_eq!(
|
||||
tbm::_blcfill_u64(0b0101_0111u64),
|
||||
0b0101_0000u64);
|
||||
assert_eq!(
|
||||
tbm::_blcfill_u64(0b1111_1111u64),
|
||||
0u64);
|
||||
assert_eq!(tbm::_blcfill_u64(0b0101_0111u64), 0b0101_0000u64);
|
||||
assert_eq!(tbm::_blcfill_u64(0b1111_1111u64), 0u64);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
unsafe fn _blci_u32() {
|
||||
assert_eq!(
|
||||
tbm::_blci_u32(0b0101_0000u32),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1110u32);
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1110u32
|
||||
);
|
||||
assert_eq!(
|
||||
tbm::_blci_u32(0b1111_1111u32),
|
||||
0b1111_1111_1111_1111_1111_1110_1111_1111u32);
|
||||
0b1111_1111_1111_1111_1111_1110_1111_1111u32
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
unsafe fn _blci_u64() {
|
||||
assert_eq!(
|
||||
tbm::_blci_u64(0b0101_0000u64),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64);
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110u64
|
||||
);
|
||||
assert_eq!(
|
||||
tbm::_blci_u64(0b1111_1111u64),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64);
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1110_1111_1111u64
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
unsafe fn _blcic_u32() {
|
||||
assert_eq!(
|
||||
tbm::_blcic_u32(0b0101_0001u32),
|
||||
0b0000_0010u32);
|
||||
assert_eq!(
|
||||
tbm::_blcic_u32(0b1111_1111u32),
|
||||
0b1_0000_0000u32);
|
||||
assert_eq!(tbm::_blcic_u32(0b0101_0001u32), 0b0000_0010u32);
|
||||
assert_eq!(tbm::_blcic_u32(0b1111_1111u32), 0b1_0000_0000u32);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
unsafe fn _blcic_u64() {
|
||||
assert_eq!(
|
||||
tbm::_blcic_u64(0b0101_0001u64),
|
||||
0b0000_0010u64);
|
||||
assert_eq!(
|
||||
tbm::_blcic_u64(0b1111_1111u64),
|
||||
0b1_0000_0000u64);
|
||||
assert_eq!(tbm::_blcic_u64(0b0101_0001u64), 0b0000_0010u64);
|
||||
assert_eq!(tbm::_blcic_u64(0b1111_1111u64), 0b1_0000_0000u64);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
unsafe fn _blcmsk_u32() {
|
||||
assert_eq!(
|
||||
tbm::_blcmsk_u32(0b0101_0001u32),
|
||||
0b0000_0011u32);
|
||||
assert_eq!(
|
||||
tbm::_blcmsk_u32(0b1111_1111u32),
|
||||
0b1_1111_1111u32);
|
||||
assert_eq!(tbm::_blcmsk_u32(0b0101_0001u32), 0b0000_0011u32);
|
||||
assert_eq!(tbm::_blcmsk_u32(0b1111_1111u32), 0b1_1111_1111u32);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
unsafe fn _blcmsk_u64() {
|
||||
assert_eq!(
|
||||
tbm::_blcmsk_u64(0b0101_0001u64),
|
||||
0b0000_0011u64);
|
||||
assert_eq!(
|
||||
tbm::_blcmsk_u64(0b1111_1111u64),
|
||||
0b1_1111_1111u64);
|
||||
assert_eq!(tbm::_blcmsk_u64(0b0101_0001u64), 0b0000_0011u64);
|
||||
assert_eq!(tbm::_blcmsk_u64(0b1111_1111u64), 0b1_1111_1111u64);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
unsafe fn _blcs_u32() {
|
||||
assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32);
|
||||
assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32);
|
||||
assert_eq!(tbm::_blcs_u32(0b0101_0001u32), 0b0101_0011u32);
|
||||
assert_eq!(tbm::_blcs_u32(0b1111_1111u32), 0b1_1111_1111u32);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
unsafe fn _blcs_u64() {
|
||||
assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64);
|
||||
assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64);
|
||||
assert_eq!(tbm::_blcs_u64(0b0101_0001u64), 0b0101_0011u64);
|
||||
assert_eq!(tbm::_blcs_u64(0b1111_1111u64), 0b1_1111_1111u64);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
unsafe fn _blsfill_u32() {
|
||||
assert_eq!(
|
||||
tbm::_blsfill_u32(0b0101_0100u32),
|
||||
0b0101_0111u32);
|
||||
assert_eq!(tbm::_blsfill_u32(0b0101_0100u32), 0b0101_0111u32);
|
||||
assert_eq!(
|
||||
tbm::_blsfill_u32(0u32),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111u32);
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111u32
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
unsafe fn _blsfill_u64() {
|
||||
assert_eq!(
|
||||
tbm::_blsfill_u64(0b0101_0100u64),
|
||||
0b0101_0111u64);
|
||||
assert_eq!(tbm::_blsfill_u64(0b0101_0100u64), 0b0101_0111u64);
|
||||
assert_eq!(
|
||||
tbm::_blsfill_u64(0u64),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
unsafe fn _blsic_u32() {
|
||||
assert_eq!(
|
||||
tbm::_blsic_u32(0b0101_0100u32),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1011u32);
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1011u32
|
||||
);
|
||||
assert_eq!(
|
||||
tbm::_blsic_u32(0u32),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111u32);
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111u32
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
unsafe fn _blsic_u64() {
|
||||
assert_eq!(
|
||||
tbm::_blsic_u64(0b0101_0100u64),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64);
|
||||
assert_eq!(
|
||||
tbm::_blsic_u64(0u64),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1011u64
|
||||
);
|
||||
assert_eq!(
|
||||
tbm::_blsic_u64(0u64),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
unsafe fn _t1mskc_u32() {
|
||||
assert_eq!(
|
||||
tbm::_t1mskc_u32(0b0101_0111u32),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1000u32);
|
||||
assert_eq!(
|
||||
tbm::_t1mskc_u32(0u32),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111u32);
|
||||
assert_eq!(
|
||||
tbm::_t1mskc_u32(0b0101_0111u32),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1000u32
|
||||
);
|
||||
assert_eq!(
|
||||
tbm::_t1mskc_u32(0u32),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111u32
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
#[cfg(not(target_arch = "x86"))]
|
||||
#[cfg_attr(rustfmt, rustfmt_skip)]
|
||||
unsafe fn _t1mksc_u64() {
|
||||
assert_eq!(
|
||||
tbm::_t1mskc_u64(0b0101_0111u64),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64);
|
||||
assert_eq!(
|
||||
tbm::_t1mskc_u64(0u64),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64);
|
||||
assert_eq!(
|
||||
tbm::_t1mskc_u64(0b0101_0111u64),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1000u64
|
||||
);
|
||||
assert_eq!(
|
||||
tbm::_t1mskc_u64(0u64),
|
||||
0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111u64
|
||||
);
|
||||
}
|
||||
|
||||
#[simd_test = "tbm"]
|
||||
|
|
|
|||
|
|
@ -2,7 +2,10 @@ use std::env;
|
|||
|
||||
fn main() {
|
||||
println!("cargo:rerun-if-changed=build.rs");
|
||||
let opt_level = env::var("OPT_LEVEL").ok().and_then(|s| s.parse().ok()).unwrap_or(0);
|
||||
let opt_level = env::var("OPT_LEVEL")
|
||||
.ok()
|
||||
.and_then(|s| s.parse().ok())
|
||||
.unwrap_or(0);
|
||||
let profile = env::var("PROFILE").unwrap_or(String::new());
|
||||
if profile == "release" || opt_level >= 2 {
|
||||
println!("cargo:rustc-cfg=optimized");
|
||||
|
|
|
|||
|
|
@ -21,13 +21,13 @@ extern crate synom;
|
|||
use proc_macro2::TokenStream;
|
||||
|
||||
#[proc_macro_attribute]
|
||||
pub fn assert_instr(attr: proc_macro::TokenStream,
|
||||
item: proc_macro::TokenStream)
|
||||
-> proc_macro::TokenStream
|
||||
{
|
||||
pub fn assert_instr(
|
||||
attr: proc_macro::TokenStream, item: proc_macro::TokenStream
|
||||
) -> proc_macro::TokenStream {
|
||||
let invoc = syn::parse::<Invoc>(attr)
|
||||
.expect("expected #[assert_instr(instr, a = b, ...)]");
|
||||
let item = syn::parse::<syn::Item>(item).expect("must be attached to an item");
|
||||
let item =
|
||||
syn::parse::<syn::Item>(item).expect("must be attached to an item");
|
||||
let func = match item.node {
|
||||
syn::ItemKind::Fn(ref f) => f,
|
||||
_ => panic!("must be attached to a function"),
|
||||
|
|
@ -40,10 +40,11 @@ pub fn assert_instr(attr: proc_macro::TokenStream,
|
|||
(quote! { #[ignore] }).into()
|
||||
};
|
||||
let name = &func.ident;
|
||||
let assert_name = syn::Ident::from(&format!("assert_{}_{}",
|
||||
name.sym.as_str(),
|
||||
instr.sym.as_str())[..]);
|
||||
let shim_name = syn::Ident::from(&format!("{}_shim", name.sym.as_str())[..]);
|
||||
let assert_name = syn::Ident::from(
|
||||
&format!("assert_{}_{}", name.sym.as_str(), instr.sym.as_str())[..],
|
||||
);
|
||||
let shim_name =
|
||||
syn::Ident::from(&format!("{}_shim", name.sym.as_str())[..]);
|
||||
let (to_test, test_name) = if invoc.args.len() == 0 {
|
||||
(TokenStream::empty(), &func.ident)
|
||||
} else {
|
||||
|
|
@ -69,16 +70,29 @@ pub fn assert_instr(attr: proc_macro::TokenStream,
|
|||
}
|
||||
};
|
||||
}
|
||||
let attrs = item.attrs.iter().filter(|attr| {
|
||||
attr.path.segments.get(0).item().ident.sym.as_str().starts_with("target")
|
||||
}).collect::<Vec<_>>();
|
||||
let attrs = item.attrs
|
||||
.iter()
|
||||
.filter(|attr| {
|
||||
attr.path
|
||||
.segments
|
||||
.get(0)
|
||||
.item()
|
||||
.ident
|
||||
.sym
|
||||
.as_str()
|
||||
.starts_with("target")
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let attrs = Append(&attrs);
|
||||
(quote! {
|
||||
#attrs
|
||||
unsafe fn #shim_name(#(#inputs),*) #ret {
|
||||
#name(#(#input_vals),*)
|
||||
}
|
||||
}.into(), &shim_name)
|
||||
(
|
||||
quote! {
|
||||
#attrs
|
||||
unsafe fn #shim_name(#(#inputs),*) #ret {
|
||||
#name(#(#input_vals),*)
|
||||
}
|
||||
}.into(),
|
||||
&shim_name,
|
||||
)
|
||||
};
|
||||
|
||||
let tts: TokenStream = quote! {
|
||||
|
|
@ -128,8 +142,9 @@ impl synom::Synom for Invoc {
|
|||
struct Append<T>(T);
|
||||
|
||||
impl<T> quote::ToTokens for Append<T>
|
||||
where T: Clone + IntoIterator,
|
||||
T::Item: quote::ToTokens
|
||||
where
|
||||
T: Clone + IntoIterator,
|
||||
T::Item: quote::ToTokens,
|
||||
{
|
||||
fn to_tokens(&self, tokens: &mut quote::Tokens) {
|
||||
for item in self.0.clone() {
|
||||
|
|
|
|||
|
|
@ -1,16 +1,16 @@
|
|||
//! Implementation of the `#[simd_test]` macro
|
||||
//!
|
||||
//! This macro expands to a `#[test]` function which tests the local machine for
|
||||
//! the appropriate cfg before calling the inner test function.
|
||||
//! This macro expands to a `#[test]` function which tests the local machine
|
||||
//! for the appropriate cfg before calling the inner test function.
|
||||
|
||||
#![feature(proc_macro)]
|
||||
|
||||
extern crate proc_macro2;
|
||||
extern crate proc_macro;
|
||||
#[macro_use]
|
||||
extern crate quote;
|
||||
extern crate proc_macro;
|
||||
extern crate proc_macro2;
|
||||
|
||||
use proc_macro2::{TokenStream, Term, TokenNode, TokenTree};
|
||||
use proc_macro2::{Term, TokenNode, TokenStream, TokenTree};
|
||||
use proc_macro2::Literal;
|
||||
|
||||
fn string(s: &str) -> TokenTree {
|
||||
|
|
@ -22,8 +22,9 @@ fn string(s: &str) -> TokenTree {
|
|||
}
|
||||
|
||||
#[proc_macro_attribute]
|
||||
pub fn simd_test(attr: proc_macro::TokenStream,
|
||||
item: proc_macro::TokenStream) -> proc_macro::TokenStream {
|
||||
pub fn simd_test(
|
||||
attr: proc_macro::TokenStream, item: proc_macro::TokenStream
|
||||
) -> proc_macro::TokenStream {
|
||||
let tokens = TokenStream::from(attr).into_iter().collect::<Vec<_>>();
|
||||
if tokens.len() != 2 {
|
||||
panic!("expected #[simd_test = \"feature\"]");
|
||||
|
|
@ -37,8 +38,9 @@ pub fn simd_test(attr: proc_macro::TokenStream,
|
|||
TokenNode::Literal(ref l) => l.to_string(),
|
||||
_ => panic!("expected #[simd_test = \"feature\"]"),
|
||||
};
|
||||
let enable_feature = enable_feature.trim_left_matches('"')
|
||||
.trim_right_matches('"');
|
||||
let enable_feature = enable_feature
|
||||
.trim_left_matches('"')
|
||||
.trim_right_matches('"');
|
||||
let enable_feature = string(&format!("+{}", enable_feature));
|
||||
let item = TokenStream::from(item);
|
||||
let name = find_name(item.clone());
|
||||
|
|
@ -67,7 +69,7 @@ fn find_name(item: TokenStream) -> Term {
|
|||
while let Some(tok) = tokens.next() {
|
||||
if let TokenNode::Term(word) = tok.kind {
|
||||
if word.as_str() == "fn" {
|
||||
break
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,12 +7,12 @@
|
|||
#![feature(proc_macro)]
|
||||
|
||||
extern crate assert_instr_macro;
|
||||
extern crate simd_test_macro;
|
||||
extern crate backtrace;
|
||||
extern crate cc;
|
||||
extern crate rustc_demangle;
|
||||
#[macro_use]
|
||||
extern crate lazy_static;
|
||||
extern crate rustc_demangle;
|
||||
extern crate simd_test_macro;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
|
|
@ -23,7 +23,8 @@ pub use assert_instr_macro::*;
|
|||
pub use simd_test_macro::*;
|
||||
|
||||
lazy_static! {
|
||||
static ref DISASSEMBLY: HashMap<String, Vec<Function>> = disassemble_myself();
|
||||
static ref DISASSEMBLY: HashMap<String, Vec<Function>>
|
||||
= disassemble_myself();
|
||||
}
|
||||
|
||||
struct Function {
|
||||
|
|
@ -37,14 +38,22 @@ struct Instruction {
|
|||
fn disassemble_myself() -> HashMap<String, Vec<Function>> {
|
||||
let me = env::current_exe().expect("failed to get current exe");
|
||||
|
||||
if cfg!(target_arch = "x86_64") &&
|
||||
cfg!(target_os = "windows") &&
|
||||
cfg!(target_env = "msvc") {
|
||||
let mut cmd = cc::windows_registry::find("x86_64-pc-windows-msvc", "dumpbin.exe")
|
||||
.expect("failed to find `dumpbin` tool");
|
||||
let output = cmd.arg("/DISASM").arg(&me).output()
|
||||
if cfg!(target_arch = "x86_64") && cfg!(target_os = "windows")
|
||||
&& cfg!(target_env = "msvc")
|
||||
{
|
||||
let mut cmd = cc::windows_registry::find(
|
||||
"x86_64-pc-windows-msvc",
|
||||
"dumpbin.exe",
|
||||
).expect("failed to find `dumpbin` tool");
|
||||
let output = cmd.arg("/DISASM")
|
||||
.arg(&me)
|
||||
.output()
|
||||
.expect("failed to execute dumpbin");
|
||||
println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr));
|
||||
println!(
|
||||
"{}\n{}",
|
||||
output.status,
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
assert!(output.status.success());
|
||||
parse_dumpbin(&String::from_utf8_lossy(&output.stdout))
|
||||
} else if cfg!(target_os = "windows") {
|
||||
|
|
@ -55,7 +64,11 @@ fn disassemble_myself() -> HashMap<String, Vec<Function>> {
|
|||
.arg(&me)
|
||||
.output()
|
||||
.expect("failed to execute otool");
|
||||
println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr));
|
||||
println!(
|
||||
"{}\n{}",
|
||||
output.status,
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
assert!(output.status.success());
|
||||
|
||||
parse_otool(&str::from_utf8(&output.stdout).expect("stdout not utf8"))
|
||||
|
|
@ -66,10 +79,16 @@ fn disassemble_myself() -> HashMap<String, Vec<Function>> {
|
|||
.arg(&me)
|
||||
.output()
|
||||
.expect("failed to execute objdump");
|
||||
println!("{}\n{}", output.status, String::from_utf8_lossy(&output.stderr));
|
||||
println!(
|
||||
"{}\n{}",
|
||||
output.status,
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
assert!(output.status.success());
|
||||
|
||||
parse_objdump(&str::from_utf8(&output.stdout).expect("stdout not utf8"))
|
||||
parse_objdump(
|
||||
&str::from_utf8(&output.stdout).expect("stdout not utf8"),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -91,7 +110,7 @@ fn parse_objdump(output: &str) -> HashMap<String, Vec<Function>> {
|
|||
while let Some(header) = lines.next() {
|
||||
// symbols should start with `$hex_addr <$name>:`
|
||||
if !header.ends_with(">:") {
|
||||
continue
|
||||
continue;
|
||||
}
|
||||
let start = header.find("<").unwrap();
|
||||
let symbol = &header[start + 1..header.len() - 2];
|
||||
|
|
@ -99,15 +118,17 @@ fn parse_objdump(output: &str) -> HashMap<String, Vec<Function>> {
|
|||
let mut instructions = Vec::new();
|
||||
while let Some(instruction) = lines.next() {
|
||||
if instruction.is_empty() {
|
||||
break
|
||||
break;
|
||||
}
|
||||
// Each line of instructions should look like:
|
||||
//
|
||||
// $rel_offset: ab cd ef 00 $instruction...
|
||||
let parts = instruction.split_whitespace()
|
||||
let parts = instruction
|
||||
.split_whitespace()
|
||||
.skip(1)
|
||||
.skip_while(|s| {
|
||||
s.len() == expected_len && usize::from_str_radix(s, 16).is_ok()
|
||||
s.len() == expected_len
|
||||
&& usize::from_str_radix(s, 16).is_ok()
|
||||
})
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<String>>();
|
||||
|
|
@ -116,10 +137,12 @@ fn parse_objdump(output: &str) -> HashMap<String, Vec<Function>> {
|
|||
|
||||
ret.entry(normalize(symbol))
|
||||
.or_insert(Vec::new())
|
||||
.push(Function { instrs: instructions });
|
||||
.push(Function {
|
||||
instrs: instructions,
|
||||
});
|
||||
}
|
||||
|
||||
return ret
|
||||
return ret;
|
||||
}
|
||||
|
||||
fn parse_otool(output: &str) -> HashMap<String, Vec<Function>> {
|
||||
|
|
@ -138,7 +161,7 @@ fn parse_otool(output: &str) -> HashMap<String, Vec<Function>> {
|
|||
};
|
||||
// symbols should start with `$symbol:`
|
||||
if !header.ends_with(":") {
|
||||
continue
|
||||
continue;
|
||||
}
|
||||
// strip the leading underscore and the trailing colon
|
||||
let symbol = &header[1..header.len() - 1];
|
||||
|
|
@ -147,12 +170,13 @@ fn parse_otool(output: &str) -> HashMap<String, Vec<Function>> {
|
|||
while let Some(instruction) = lines.next() {
|
||||
if instruction.ends_with(":") {
|
||||
cached_header = Some(instruction);
|
||||
break
|
||||
break;
|
||||
}
|
||||
// Each line of instructions should look like:
|
||||
//
|
||||
// $addr $instruction...
|
||||
let parts = instruction.split_whitespace()
|
||||
let parts = instruction
|
||||
.split_whitespace()
|
||||
.skip(1)
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<String>>();
|
||||
|
|
@ -161,10 +185,12 @@ fn parse_otool(output: &str) -> HashMap<String, Vec<Function>> {
|
|||
|
||||
ret.entry(normalize(symbol))
|
||||
.or_insert(Vec::new())
|
||||
.push(Function { instrs: instructions });
|
||||
.push(Function {
|
||||
instrs: instructions,
|
||||
});
|
||||
}
|
||||
|
||||
return ret
|
||||
return ret;
|
||||
}
|
||||
|
||||
fn parse_dumpbin(output: &str) -> HashMap<String, Vec<Function>> {
|
||||
|
|
@ -183,7 +209,7 @@ fn parse_dumpbin(output: &str) -> HashMap<String, Vec<Function>> {
|
|||
};
|
||||
// symbols should start with `$symbol:`
|
||||
if !header.ends_with(":") {
|
||||
continue
|
||||
continue;
|
||||
}
|
||||
// strip the trailing colon
|
||||
let symbol = &header[..header.len() - 1];
|
||||
|
|
@ -192,20 +218,21 @@ fn parse_dumpbin(output: &str) -> HashMap<String, Vec<Function>> {
|
|||
while let Some(instruction) = lines.next() {
|
||||
if !instruction.starts_with(" ") {
|
||||
cached_header = Some(instruction);
|
||||
break
|
||||
break;
|
||||
}
|
||||
// Each line looks like:
|
||||
//
|
||||
// > $addr: ab cd ef $instr..
|
||||
// > 00 12 # this line os optional
|
||||
if instruction.starts_with(" ") {
|
||||
continue
|
||||
continue;
|
||||
}
|
||||
let parts = instruction.split_whitespace()
|
||||
let parts = instruction
|
||||
.split_whitespace()
|
||||
.skip(1)
|
||||
.skip_while(|s| {
|
||||
s.len() == 2 && usize::from_str_radix(s, 16).is_ok()
|
||||
})
|
||||
.skip_while(
|
||||
|s| s.len() == 2 && usize::from_str_radix(s, 16).is_ok(),
|
||||
)
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<String>>();
|
||||
instructions.push(Instruction { parts });
|
||||
|
|
@ -213,10 +240,12 @@ fn parse_dumpbin(output: &str) -> HashMap<String, Vec<Function>> {
|
|||
|
||||
ret.entry(normalize(symbol))
|
||||
.or_insert(Vec::new())
|
||||
.push(Function { instrs: instructions });
|
||||
.push(Function {
|
||||
instrs: instructions,
|
||||
});
|
||||
}
|
||||
|
||||
return ret
|
||||
return ret;
|
||||
}
|
||||
|
||||
fn normalize(symbol: &str) -> String {
|
||||
|
|
@ -266,7 +295,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
|
|||
// instruction: tzcntl => tzcnt and compares that.
|
||||
if part.starts_with(expected) {
|
||||
found = true;
|
||||
break
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -274,7 +303,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
|
|||
let probably_only_one_instruction = function.instrs.len() < 30;
|
||||
|
||||
if found && probably_only_one_instruction {
|
||||
return
|
||||
return;
|
||||
}
|
||||
|
||||
// Help debug by printing out the found disassembly, and then panic as we
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#![cfg_attr(feature = "strict", deny(warnings))]
|
||||
#![feature(cfg_target_feature)]
|
||||
|
||||
extern crate cupid;
|
||||
#[macro_use]
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
extern crate stdsimd;
|
||||
extern crate cupid;
|
||||
|
||||
#[test]
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue